From 67a042b917ca307c428b0a12f08a441b414cd4df Mon Sep 17 00:00:00 2001 From: xueguang Date: Fri, 18 Sep 2020 02:38:06 +0800 Subject: [PATCH 01/50] clean code --- bertserini/bert_reader.py | 249 +++++++++++++++++++++ bertserini/interactive.py | 15 +- bertserini/modeling_bert_qa.py | 104 --------- bertserini/modeling_bert_qa_hardem.py | 113 ---------- bertserini/pyserini_retriever.py | 38 ++++ bertserini/read.py | 25 --- bertserini/retriever/__init__.py | 0 bertserini/retriever/anserini_retriever.py | 47 ---- bertserini/retriever/pyserini_retriever.py | 37 --- bertserini/run_squad.py | 235 ------------------- bertserini/run_squad_train.py | 160 ------------- bertserini/search.py | 14 +- bertserini/utils.py | 40 ++-- bertserini/utils_squad.py | 2 +- requirements.txt | 1 + 15 files changed, 327 insertions(+), 753 deletions(-) create mode 100644 bertserini/bert_reader.py delete mode 100644 bertserini/modeling_bert_qa.py delete mode 100644 bertserini/modeling_bert_qa_hardem.py create mode 100644 bertserini/pyserini_retriever.py delete mode 100644 bertserini/read.py delete mode 100644 bertserini/retriever/__init__.py delete mode 100755 bertserini/retriever/anserini_retriever.py delete mode 100644 bertserini/retriever/pyserini_retriever.py delete mode 100644 bertserini/run_squad_train.py diff --git a/bertserini/bert_reader.py b/bertserini/bert_reader.py new file mode 100644 index 0000000..46e1349 --- /dev/null +++ b/bertserini/bert_reader.py @@ -0,0 +1,249 @@ +import os +import logging +import torch +from torch.utils.data import DataLoader, SequentialSampler +from transformers.data.processors.squad import SquadResult +from bertserini.run_squad import to_list +from bertserini.utils_squad import compute_predictions_log_probs, compute_predictions_logits, SquadExample +from transformers import ( + AutoModelForQuestionAnswering, + AutoTokenizer, + squad_convert_examples_to_features, +) + + +logger = logging.getLogger(__name__) + + +class MySquadExample(SquadExample): + def 
__init__(self, + qas_id, + question_text, + context_text, + answer_text, + start_position_character, + title, + answers=[], + is_impossible=False, + paragraph_score=0, + chinese=False, + tokenizer=None): + super(MySquadExample, self).__init__( + qas_id, + question_text, + context_text, + answer_text, + start_position_character, + title, + answers, + is_impossible, + chinese, + tokenizer, + ) + self.paragraph_score = paragraph_score + + +def create_inference_examples(query, paragraphs, paragraph_scores, chinese=False, tokenizer=None): + examples = [] + for (id, paragraph) in enumerate(paragraphs): + example = MySquadExample( + qas_id=id, + question_text=query, + context_text=paragraph, + answer_text=None, + start_position_character=None, + title="", + is_impossible=False, + answers=[], + paragraph_score=paragraph_scores[id], + chinese=chinese, + tokenizer=tokenizer, + ) + id += 1 + examples.append(example) + + return examples + + +class BertReader: + def __init__(self, args): + super(BertReader, self).__init__() + self.args = args + self.tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + checkpoint = self.args.model_name_or_path + + logger.info("Evaluate the following checkpoints: %s", checkpoint) + + # Reload the model + global_step = "" + self.model = AutoModelForQuestionAnswering.from_pretrained(checkpoint) # , force_download=True) + self.model = self.model.to(args.device) + + self.model.eval() + + def predict(self, id_, question, paragraph_texts, paragraph_scores): + # dataset, examples, features = load_and_cache_examples(self.args, self.tokenizer, evaluate=True, output_examples=True) + + # processor = SquadV2Processor() if self.args.version_2_with_negative else SquadV1Processor() + # todo convert to single query examples + examples = create_inference_examples( + question, + paragraph_texts, + 
paragraph_scores, + chinese=self.args.chinese, + tokenizer=self.tokenizer) + + features, dataset = squad_convert_examples_to_features( + examples=examples, + tokenizer=self.tokenizer, + max_seq_length=self.args.max_seq_length, + doc_stride=self.args.doc_stride, + max_query_length=self.args.max_query_length, + is_training=False, + return_dataset="pt", + threads=self.args.threads, + tqdm_enabled=False + ) + + # if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: + # os.makedirs(args.output_dir) + + self.args.eval_batch_size = self.args.per_gpu_eval_batch_size * max(1, self.args.n_gpu) + + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(dataset) + eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size) + + # multi-gpu evaluate + if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): + self.model = torch.nn.DataParallel(self.model) + + # Eval! + # logger.info("***** Running evaluation {} *****".format(prefix)) + # logger.info(" Num examples = %d", len(dataset)) + # logger.info(" Batch size = %d", args.eval_batch_size) + + all_results = [] + # start_time = timeit.default_timer() + + for batch in eval_dataloader: + self.model.eval() + batch = tuple(t.to(self.args.device) for t in batch) + + with torch.no_grad(): + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2], + } + + # if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]: + # del inputs["token_type_ids"] + + feature_indices = batch[3] + + # XLNet and XLM use more arguments for their predictions + # if args.model_type in ["xlnet", "xlm"]: + # inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) + # # for lang_id-sensitive xlm models + # if hasattr(model, "config") and hasattr(model.config, "lang2id"): + # inputs.update( + # {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} + # ) + + outputs = 
self.model(**inputs) + + for i, feature_index in enumerate(feature_indices): + eval_feature = features[feature_index.item()] + unique_id = int(eval_feature.unique_id) + + output = [to_list(output[i]) for output in outputs] + + # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" + # models only use two. + if len(output) >= 5: + start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3] + cls_logits = output[4] + + result = SquadResult( + unique_id, + start_logits, + end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + + else: + start_logits, end_logits = output + result = SquadResult(unique_id, start_logits, end_logits) + + all_results.append(result) + + # Compute predictions + prefix = "" + output_prediction_file = os.path.join(self.args.output_dir, "predictions_{}.json".format(prefix)) + output_nbest_file = os.path.join(self.args.output_dir, "nbest_predictions_{}.json".format(prefix)) + + if self.args.version_2_with_negative: + output_null_log_odds_file = os.path.join(self.args.output_dir, "null_odds_{}.json".format(prefix)) + else: + output_null_log_odds_file = None + + # XLNet and XLM use a more complex post-processing procedure + if self.args.model_type in ["xlnet", "xlm"]: + start_n_top = self.model.config.start_n_top if hasattr(self.model, + "config") else self.model.module.config.start_n_top + end_n_top = self.model.config.end_n_top if hasattr(self.model, + "config") else self.model.module.config.end_n_top + + answers, nbest_answers = compute_predictions_log_probs( + examples, + features, + all_results, + self.args.n_best_size, + self.args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + self.args.version_2_with_negative, + self.tokenizer, + self.args.verbose_logging, + self.args.chinese + ) + else: + answers, nbest_answers = 
compute_predictions_logits( + examples, + features, + all_results, + self.args.n_best_size, + self.args.max_answer_length, + self.args.do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + self.args.verbose_logging, + self.args.version_2_with_negative, + self.args.null_score_diff_threshold, + self.tokenizer, + self.args.chinese + ) + + all_answers = [] + for answer_id, ans in enumerate(answers): + ans_dict = {"id": id_, + "answer": answers[ans][0], + "phrase_score": answers[ans][1], + "paragraph_score": paragraph_scores[answer_id], + } + all_answers.append(ans_dict) + return all_answers diff --git a/bertserini/interactive.py b/bertserini/interactive.py index 7728132..948870b 100644 --- a/bertserini/interactive.py +++ b/bertserini/interactive.py @@ -1,11 +1,6 @@ -import json -import time -import unicodedata - -from run_squad_new import BertReader -from bertserini.retriever.anserini_retriever import anserini_retriever, build_searcher -#from retriever.pyserini_retriever import anserini_retriever, build_searcher -from bertserini.utils import (convert_squad_to_list, normalize_text, strip_accents, choose_best_answer, weighted_score) +from bertserini.bert_reader import BertReader +from bertserini.pyserini_retriever import retriever, build_searcher +from bertserini.utils import (choose_best_answer, weighted_score) from bertserini.args import * @@ -20,9 +15,9 @@ if len(question.strip()) == 0: break if args.chinese: - paragraphs = anserini_retriever(question.encode("utf-8"), ansrini_searcher, args.para_num) + paragraphs = retriever(question.encode("utf-8"), ansrini_searcher, args.para_num) else: - paragraphs = anserini_retriever(question, ansrini_searcher, args.para_num) + paragraphs = retriever(question, ansrini_searcher, args.para_num) if len(paragraphs) == 0: print("No related Wiki passage found") paragraph_texts = [] diff --git a/bertserini/modeling_bert_qa.py b/bertserini/modeling_bert_qa.py deleted file mode 100644 index 
43af8b9..0000000 --- a/bertserini/modeling_bert_qa.py +++ /dev/null @@ -1,104 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. """ - -from __future__ import absolute_import, division, print_function, unicode_literals - -import json -import logging -import math -import os -import sys -from io import open - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss - -from pytorch_transformers.modeling_bert import BertPreTrainedModel, BertModel - -class BertForQuestionAnswering(BertPreTrainedModel): - r""" - **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. 
- Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-start scores (before SoftMax). - **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-end scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- Examples:: - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - gits - start_positions = torch.tensor([1]) - end_positions = torch.tensor([3]) - outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) - loss, start_scores, end_scores = outputs[:2] - """ - def __init__(self, config): - super(BertForQuestionAnswering, self).__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.apply(self.init_weights) - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, - end_positions=None, position_ids=None, head_mask=None): - outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - attention_mask=attention_mask, head_mask=head_mask) - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - outputs = (start_logits, end_logits,) + outputs[2:] - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - outputs = (total_loss,) + outputs - - return outputs # (loss), 
start_logits, end_logits, (hidden_states), (attentions) diff --git a/bertserini/modeling_bert_qa_hardem.py b/bertserini/modeling_bert_qa_hardem.py deleted file mode 100644 index 2cc9aea..0000000 --- a/bertserini/modeling_bert_qa_hardem.py +++ /dev/null @@ -1,113 +0,0 @@ -from __future__ import absolute_import, division, print_function, unicode_literals - -import json -import logging -import math -import os -import sys -from io import open - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss - -from pytorch_transformers.modeling_bert import BertPreTrainedModel, BertModel - - -class BertForQuestionAnswering(BertPreTrainedModel): - - def __init__(self, config, device, loss_type, variant_id=0, tau=None): - super(BertForQuestionAnswering, self).__init__(config) - self.bert = BertModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, 2) # [N, L, H] => [N, L, 2] - #self.qa_classifier = nn.Linear(config.hidden_size, n_class) # [N, H] => [N, n_class] - self.device = device - def init_weights(module): - if isinstance(module, (nn.Linear, nn.Embedding)): - module.weight.data.normal_(mean=0.0, std=config.initializer_range) - if isinstance(module, nn.Linear): - module.bias.data.zero_() - - self.apply(init_weights) - self.loss_type = loss_type - self.tau = tau - if self.loss_type=='hard-em': - assert tau is not None - - def _forward(self, input_ids, attention_mask, token_type_ids): - ''' - each batch is a list of 7 items (training) or 3 items (inference) - - input_ids: token id of the input sequence - - attention_mask: mask of the sequence (1 for present, 0 for blank) - - token_type_ids: indicator of type of sequence. - - e.g. in QA, whether it is question or document - - (training) start_positions: list of start positions of the span - - (training) end_positions: list of end positions of the span - - (training) switch: list of switches (can be used for general purposes. 
- - in this model, 0 means the answer is span, 1 means the answer is `yes`, - - 2 means the answer is `no`, 3 means there's no answer - - (training) answer_mask: list of answer mask. - - e.g. if the possible spans are `[0, 7], [3, 7]`, and your `max_n_answers` is 3, - - start_positions: [[0, 3, 0]] - - end_positions: [[7, 7, 0]] - - switch: [[0, 0, 0]] - - answer_mask: [[1, 1, 0]] - ''' - all_encoder_layers = self.bert(input_ids, token_type_ids, attention_mask) - sequence_output = all_encoder_layers[0] - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - #switch_logits = self.qa_classifier(torch.max(sequence_output, 1)[0]) - return start_logits, end_logits - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, - end_positions=None, answer_mask=None, position_ids=None, head_mask=None, global_step=-1): - start_logits, end_logits = self._forward(input_ids, attention_mask, token_type_ids) - if start_positions is not None: - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - #answer_mask = answer_mask.type(torch.FloatTensor).to(self.device) - loss_fct = CrossEntropyLoss(ignore_index=ignored_index, reduce=False) - # You care about the span only when switch is 0 - #span_mask = answer_mask * (switch == 0).type(torch.FloatTensor).to(self.device) - - start_losses = [(loss_fct(start_logits, _start_positions) * _answer_mask) \ - for (_start_positions, _answer_mask) \ - in zip(torch.unbind(start_positions, dim=1), torch.unbind(answer_mask, dim=1))] - end_losses = [loss_fct(end_logits, _end_positions) * _answer_mask \ - for (_end_positions, _answer_mask) \ - in zip(torch.unbind(end_positions, dim=1), torch.unbind(answer_mask, dim=1))] - assert len(start_losses) == len(end_losses) - loss_tensor = \ - torch.cat([t.unsqueeze(1) for t in 
start_losses], dim=1) + \ - torch.cat([t.unsqueeze(1) for t in end_losses], dim=1) - - if self.loss_type=='first-only': - total_loss = torch.sum(start_losses[0]+end_losses[0]+switch_losses[0]) - elif self.loss_type == "hard-em": - if numpy.random.random()<" in text) or \ + ("|----|" in text) or ("#fffff" in text): + continue + else: + paragraph_dict = {'text': text, + 'paragraph_score': score, + 'docid': doc_id} + paragraphs.append(paragraph_dict) + + return paragraphs diff --git a/bertserini/read.py b/bertserini/read.py deleted file mode 100644 index 6c5a893..0000000 --- a/bertserini/read.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -from tqdm import trange, tqdm - -from bert_reader import BertReader -from bertserini.args import * -from bertserini.utils import strip_accents - - -if __name__ == "__main__": - QAs = convert_squad_to_list("./data/squad_v1.1/dev-v1.1.json") - - bert_reader = BertReader(args) - all_results = [] - for question_id in trange(len(QAs)): - question = strip_accents(QAs[question_id]["question"]) - paragraph_texts = [QAs[question_id]["context"]] - id_ = QAs[question_id]["id"] - - paragraph_scores = [100] - - final_answers = bert_reader.predict(id_, question, paragraph_texts, paragraph_scores) - print(question, final_answers) - - all_results.append(final_answers) - json.dump(all_results, open("pytorch_bert_squad.json", 'w')) diff --git a/bertserini/retriever/__init__.py b/bertserini/retriever/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bertserini/retriever/anserini_retriever.py b/bertserini/retriever/anserini_retriever.py deleted file mode 100755 index 0c1e6f2..0000000 --- a/bertserini/retriever/anserini_retriever.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -import numpy as np - -import jnius_config -jnius_config.set_classpath("lib/anserini-0.6.0-SNAPSHOT-fatjar.jar") - -from jnius import autoclass -JString = autoclass('java.lang.String') -JSearcher = autoclass('io.anserini.search.SimpleSearcher') - -from 
bertserini.utils import init_logger, strip_accents, normalize_text -#logger = init_logger("anserini_retriever") - -def build_searcher(k1=0.9, b=0.4, index_path="index/lucene-index.wiki_paragraph_drqa.pos+docvectors", segmented=False, rm3=False, chinese=False): - searcher = JSearcher(JString(index_path)) - searcher.setBM25Similarity(k1, b) - if not rm3: - searcher.setSearchChinese(chinese) - searcher.setDefaultReranker() - else: - searcher.setRM3Reranker() - return searcher - - -def anserini_retriever(question, searcher, para_num=20, tag=""): - try: - #hits = searcher.search(JString(question), para_num, JString(tag)) - hits = searcher.search(JString(question.encode("utf-8")), para_num) - except ValueError as e: - #logger.error("Search failure: {}, {}".format(question, e)) - print("Search failure: {}, {}".format(question, e)) - return [] - - paragraphs = [] - - for paragraph in hits: - if ("||" in paragraph.content) or ("/><" in paragraph.content) or \ - ("|----|" in paragraph.content) or ("#fffff" in paragraph.content): - continue - else: - paragraph_dict = {'text': paragraph.content, - 'paragraph_score': paragraph.score, - 'docid': paragraph.docid} - #"tag": paragraph.tag} - paragraphs.append(paragraph_dict) - - return paragraphs diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py deleted file mode 100644 index eb36179..0000000 --- a/bertserini/retriever/pyserini_retriever.py +++ /dev/null @@ -1,37 +0,0 @@ -import os -import numpy as np - -from pyserini.search import SimpleSearcher - -from bertserini.utils import init_logger, strip_accents, normalize_text -logger = init_logger("anserini_retriever") - -def build_searcher(k1=0.9, b=0.4, index_path="index/lucene-index.wiki_paragraph_drqa.pos+docvectors", segmented=False, rm3=False, chinese=False): - searcher = SimpleSearcher(index_path) - searcher.set_bm25(k1, b) - if chinese: - searcher.object.setLanguage("zh") - print("########### we are usinig Chinese retriever 
##########") - return searcher - -def anserini_retriever(question, searcher, para_num=20, tag=""): - try: - hits = searcher.search(question, k=para_num) - except ValueError as e: - logger.error("Search failure: {}, {}".format(question, e)) - return [] - - paragraphs = [] - - for paragraph in hits: - if ("||" in paragraph.raw) or ("/><" in paragraph.raw) or \ - ("|----|" in paragraph.raw) or ("#fffff" in paragraph.raw): - continue - else: - paragraph_dict = {'text': paragraph.raw, - 'paragraph_score': paragraph.score, - 'docid': paragraph.docid} - #"tag": paragraph.tag} - paragraphs.append(paragraph_dict) - - return paragraphs diff --git a/bertserini/run_squad.py b/bertserini/run_squad.py index f345b28..782dae0 100644 --- a/bertserini/run_squad.py +++ b/bertserini/run_squad.py @@ -52,7 +52,6 @@ except ImportError: from tensorboardX import SummaryWriter -from bertserini.utils_squad import compute_predictions_log_probs, compute_predictions_logits, SquadExample # from args import * logger = logging.getLogger(__name__) @@ -401,28 +400,6 @@ def evaluate(args, model, tokenizer, prefix=""): results = squad_evaluate(examples, predictions) return results - -def create_inference_examples(query, paragraphs, paragraph_scores, chinese=False, tokenizer=None): - examples = [] - for (id, paragraph) in enumerate(paragraphs): - example = MySquadExample( - qas_id=id, - question_text=query, - context_text=paragraph, - answer_text=None, - start_position_character=None, - title="", - is_impossible=False, - answers=[], - paragraph_score=paragraph_scores[id], - chinese=chinese, - tokenizer=tokenizer, - ) - id += 1 - examples.append(example) - - return examples - def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): if args.local_rank not in [-1, 0] and not evaluate: # Make sure only the first process in distributed training process the dataset, and the others will use the cache @@ -492,218 +469,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, 
output_examples=Fal return dataset, examples, features return dataset -class MySquadExample(SquadExample): - def __init__(self, - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - title, - answers=[], - is_impossible=False, - paragraph_score=0, - chinese=False, - tokenizer=None): - - super(MySquadExample, self).__init__( - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - title, - answers, - is_impossible, - chinese, - tokenizer, - ) - self.paragraph_score = paragraph_score - - -class BertReader: - def __init__(self, args): - super(BertReader, self).__init__() - self.args = args - self.tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None, - ) - checkpoint = self.args.model_name_or_path - - logger.info("Evaluate the following checkpoints: %s", checkpoint) - - # Reload the model - global_step = "" - self.model = AutoModelForQuestionAnswering.from_pretrained(checkpoint) # , force_download=True) - self.model = self.model.to(args.device) - - self.model.eval() - - - def predict(self, id_, question, paragraph_texts, paragraph_scores): - - - # dataset, examples, features = load_and_cache_examples(self.args, self.tokenizer, evaluate=True, output_examples=True) - - # processor = SquadV2Processor() if self.args.version_2_with_negative else SquadV1Processor() - # todo convert to single query examples - examples = create_inference_examples( - question, - paragraph_texts, - paragraph_scores, - chinese=self.args.chinese, - tokenizer=self.tokenizer) - - features, dataset = squad_convert_examples_to_features( - examples=examples, - tokenizer=self.tokenizer, - max_seq_length=self.args.max_seq_length, - doc_stride=self.args.doc_stride, - max_query_length=self.args.max_query_length, - is_training=not evaluate, - return_dataset="pt", - threads=self.args.threads, 
- tqdm_enabled=False - ) - - # if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - # os.makedirs(args.output_dir) - - self.args.eval_batch_size = self.args.per_gpu_eval_batch_size * max(1, self.args.n_gpu) - - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size) - - # multi-gpu evaluate - if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): - self.model = torch.nn.DataParallel(self.model) - - # Eval! - # logger.info("***** Running evaluation {} *****".format(prefix)) - # logger.info(" Num examples = %d", len(dataset)) - # logger.info(" Batch size = %d", args.eval_batch_size) - - all_results = [] - # start_time = timeit.default_timer() - - for batch in eval_dataloader: - self.model.eval() - batch = tuple(t.to(self.args.device) for t in batch) - - with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "token_type_ids": batch[2], - } - - # if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]: - # del inputs["token_type_ids"] - - feature_indices = batch[3] - - # XLNet and XLM use more arguments for their predictions - # if args.model_type in ["xlnet", "xlm"]: - # inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) - # # for lang_id-sensitive xlm models - # if hasattr(model, "config") and hasattr(model.config, "lang2id"): - # inputs.update( - # {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} - # ) - - outputs = self.model(**inputs) - - for i, feature_index in enumerate(feature_indices): - eval_feature = features[feature_index.item()] - unique_id = int(eval_feature.unique_id) - - output = [to_list(output[i]) for output in outputs] - - # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" - # models only use two. 
- if len(output) >= 5: - start_logits = output[0] - start_top_index = output[1] - end_logits = output[2] - end_top_index = output[3] - cls_logits = output[4] - - result = SquadResult( - unique_id, - start_logits, - end_logits, - start_top_index=start_top_index, - end_top_index=end_top_index, - cls_logits=cls_logits, - ) - - else: - start_logits, end_logits = output - result = SquadResult(unique_id, start_logits, end_logits) - - all_results.append(result) - - # Compute predictions - prefix = "" - output_prediction_file = os.path.join(self.args.output_dir, "predictions_{}.json".format(prefix)) - output_nbest_file = os.path.join(self.args.output_dir, "nbest_predictions_{}.json".format(prefix)) - - if self.args.version_2_with_negative: - output_null_log_odds_file = os.path.join(self.args.output_dir, "null_odds_{}.json".format(prefix)) - else: - output_null_log_odds_file = None - - # XLNet and XLM use a more complex post-processing procedure - if self.args.model_type in ["xlnet", "xlm"]: - start_n_top = self.model.config.start_n_top if hasattr(self.model, "config") else self.model.module.config.start_n_top - end_n_top = self.model.config.end_n_top if hasattr(self.model, "config") else self.model.module.config.end_n_top - - answers, nbest_answers = compute_predictions_log_probs( - examples, - features, - all_results, - self.args.n_best_size, - self.args.max_answer_length, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - start_n_top, - end_n_top, - self.args.version_2_with_negative, - self.tokenizer, - self.args.verbose_logging, - self.args.chinese - ) - else: - answers, nbest_answers = compute_predictions_logits( - examples, - features, - all_results, - self.args.n_best_size, - self.args.max_answer_length, - self.args.do_lower_case, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - self.args.verbose_logging, - self.args.version_2_with_negative, - self.args.null_score_diff_threshold, - self.tokenizer, - 
self.args.chinese - ) - - all_answers = [] - for answer_id, ans in enumerate(answers): - ans_dict = {"id": id_, - "answer": answers[ans][0], - "phrase_score": answers[ans][1], - "paragraph_score": paragraph_scores[answer_id], - } - all_answers.append(ans_dict) - return all_answers def main(): parser = argparse.ArgumentParser() diff --git a/bertserini/run_squad_train.py b/bertserini/run_squad_train.py deleted file mode 100644 index ac3cf9d..0000000 --- a/bertserini/run_squad_train.py +++ /dev/null @@ -1,160 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Fine-tuning the library models for question-answering.""" - - -import logging -import os -import sys -from dataclasses import dataclass, field -from typing import Optional - -from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer, HfArgumentParser, SquadDataset -from transformers import SquadDataTrainingArguments as DataTrainingArguments -from transformers import Trainer, TrainingArguments - - -logger = logging.getLogger(__name__) - - -@dataclass -class ModelArguments: - """ - Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
- """ - - model_name_or_path: str = field( - metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} - ) - config_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} - ) - tokenizer_name: Optional[str] = field( - default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} - ) - use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."}) - # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script, - # or just modify its tokenizer_config.json. - cache_dir: Optional[str] = field( - default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} - ) - - -def main(): - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - - if ( - os.path.exists(training_args.output_dir) - and os.listdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
- ) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, - ) - logger.warning( - "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - training_args.local_rank, - training_args.device, - training_args.n_gpu, - bool(training_args.local_rank != -1), - training_args.fp16, - ) - logger.info("Training/evaluation parameters %s", training_args) - - # Prepare Question-Answering task - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - ) - model = AutoModelForQuestionAnswering.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - ) - - # Get datasets - is_language_sensitive = hasattr(model.config, "lang2id") - train_dataset = ( - SquadDataset( - data_args, tokenizer=tokenizer, is_language_sensitive=is_language_sensitive, cache_dir=model_args.cache_dir - ) - if training_args.do_train - else None - ) - eval_dataset = ( - SquadDataset( - data_args, - tokenizer=tokenizer, - mode="dev", - is_language_sensitive=is_language_sensitive, - cache_dir=model_args.cache_dir, - ) - if training_args.do_eval - else None - ) - - # Initialize our Trainer - trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,) - - # Training - if training_args.do_train: - trainer.train( - 
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None - ) - trainer.save_model() - # For convenience, we also re-save the tokenizer to the same directory, - # so that you can share your model easily on huggingface.co/models =) - if trainer.is_world_master(): - tokenizer.save_pretrained(training_args.output_dir) - - -def _mp_fn(index): - # For xla_spawn (TPUs) - main() - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/bertserini/search.py b/bertserini/search.py index c698488..7aa77bb 100644 --- a/bertserini/search.py +++ b/bertserini/search.py @@ -1,14 +1,12 @@ import json import time -import unicodedata -from tqdm import trange, tqdm +from tqdm import trange from hanziconv import HanziConv -from bertserini.run_squad import BertReader -#from retriever.anserini_retriever import anserini_retriever, build_searcher -from bertserini.retriever.pyserini_retriever import anserini_retriever, build_searcher -from bertserini.utils import (convert_squad_to_list, normalize_text, init_logger, strip_accents) +from bertserini.bert_reader import BertReader +from bertserini.pyserini_retriever import retriever, build_searcher +from bertserini.utils import (convert_squad_to_list, normalize_text, strip_accents) from bertserini.args import * @@ -37,9 +35,9 @@ if args.chinese: if args.toSimplified: question = HanziConv.toSimplified(question) - paragraphs = anserini_retriever(question, ansrini_searcher, args.para_num) + paragraphs = retriever(question, ansrini_searcher, args.para_num) else: - paragraphs = anserini_retriever(question, ansrini_searcher, args.para_num) + paragraphs = retriever(question, ansrini_searcher, args.para_num) if len(paragraphs) == 0: continue paragraph_texts = [] diff --git a/bertserini/utils.py b/bertserini/utils.py index a3e4fb0..5a3d2e4 100644 --- a/bertserini/utils.py +++ b/bertserini/utils.py @@ -3,23 +3,28 @@ import logging import json import re +import zhon +import numpy as np + def 
strip_accents(text): return "".join(char for char in unicodedata.normalize('NFKD', text) if unicodedata.category(char) != 'Mn') + def choose_best_answer(final_answers, score_computer, paragraph_score_weight, phrase_score_weight, mode="origin"): - scored_answers = get_voted_answers(final_answers, score_computer, - paragraph_score_weight, phrase_score_weight, mode) + paragraph_score_weight, phrase_score_weight, mode) sorted_answers = sorted(scored_answers, key=lambda k: k['total_score'], reverse=True) return sorted_answers[0] + def weighted_score(paragraph_score, phrase_score, paragraph_weight=0.5, phrase_weight=0.5): return paragraph_score * paragraph_weight + phrase_score * phrase_weight + def get_type(sent): ts = ['Who', 'Why', 'What', 'Which', 'When', 'How', 'Where'] tp = 'others' @@ -31,6 +36,7 @@ def get_type(sent): return tp return tp + def get_voted_answers(answerlist, score_computer, paragraph_score_weight, phrase_score_weight, mode="origin"): if mode == "origin": return get_scored_answers(answerlist, score_computer, paragraph_score_weight, phrase_score_weight) @@ -38,10 +44,10 @@ def get_voted_answers(answerlist, score_computer, paragraph_score_weight, phrase return get_scored_answers(answerlist[0], score_computer, paragraph_score_weight, phrase_score_weight) else: answer_dict = {} - #base_ans = get_scored_answers(answerlist[0], score_computer, paragraph_score_weight, phrase_score_weight) - #print(base_ans) + # base_ans = get_scored_answers(answerlist[0], score_computer, paragraph_score_weight, phrase_score_weight) + # print(base_ans) answerlist = answerlist - #answerlist = [answerlist[0]] + # answerlist = [answerlist[0]] answers = get_scored_answers(answerlist[0], score_computer, paragraph_score_weight, phrase_score_weight) for ans in answers: answer_text = ans['answer'] @@ -49,11 +55,11 @@ def get_voted_answers(answerlist, score_computer, paragraph_score_weight, phrase answer_score = ans['total_score'] if answer_text not in answer_dict: 
answer_dict[answer_text] = { - "count": 1, - "total_scores": [1 * answer_score], - "sentences": [answer_sentence], - "answer_text": [answer_text] - } + "count": 1, + "total_scores": [1 * answer_score], + "sentences": [answer_sentence], + "answer_text": [answer_text] + } else: answer_dict[answer_text]['count'] += 1 answer_dict[answer_text]['total_scores'].append(1 * answer_score) @@ -74,11 +80,11 @@ def get_voted_answers(answerlist, score_computer, paragraph_score_weight, phrase return combined_answers + def get_scored_answers(final_answers, score_computer, paragraph_score_weight, phrase_score_weight): - scored_answers = [] for ans_id, ans in enumerate(final_answers): - #print(ans) + # print(ans) paragraph_score = ans['paragraph_score'] phrase_score = ans['phrase_score'] total_score = score_computer(paragraph_score, phrase_score, paragraph_score_weight, phrase_score_weight) @@ -88,6 +94,7 @@ def get_scored_answers(final_answers, score_computer, paragraph_score_weight, ph # break return scored_answers + def convert_squad_to_list(squad_filename): data = json.load(open(squad_filename, 'r')) data = data["data"] @@ -102,6 +109,7 @@ def convert_squad_to_list(squad_filename): converted_data.append({"id": id_, "question": question, "answers": answers, "context": text}) return converted_data + def init_logger(bot): # create logger with 'spam_application' # bot = 'server_cn' if args.chinese else 'server_en' @@ -122,14 +130,17 @@ def init_logger(bot): logger.addHandler(ch) return logger + def split_title(paragraph): sents = paragraph.split(".") - text = ".".join(sents[1:]).strip() + text = ".".join(sents[1:]).strip() title = sents[0].strip() return title, text + def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text) @@ -145,6 +156,7 @@ def lower(text): return white_space_fix(remove_articles(remove_punc(lower(s)))) + def normalize_text(s): def remove_punc(text): 
exclude = set(string.punctuation) @@ -155,8 +167,10 @@ def lower(text): return remove_punc(lower(s)) + def normalize_chinese_text(s): def remove_punc(text): exclude = set(zhon.hanzi.punctuation) return ''.join(ch for ch in text if ch not in exclude) + return remove_punc(s) diff --git a/bertserini/utils_squad.py b/bertserini/utils_squad.py index 543b577..10128b4 100644 --- a/bertserini/utils_squad.py +++ b/bertserini/utils_squad.py @@ -800,7 +800,7 @@ def compute_predictions_log_probs( else: do_lower_case = tokenizer.do_lowercase_and_remove_accent - final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, chenese, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, chinese, verbose_logging) if final_text in seen_predictions: continue diff --git a/requirements.txt b/requirements.txt index 0508bc5..3ffca18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +numpy pyserini transformers torch From a42911ae2762c6243474aa902c3a7311a929a250 Mon Sep 17 00:00:00 2001 From: xueguang Date: Sun, 20 Sep 2020 07:09:36 +0800 Subject: [PATCH 02/50] temp --- bertserini/base.py | 147 +++++++++++++++++++++ bertserini/interactive.py | 45 ++----- bertserini/pyserini_retriever.py | 15 ++- bertserini/readers.py | 112 ++++++++++++++++ bertserini/utils_squad.py | 215 ++----------------------------- 5 files changed, 290 insertions(+), 244 deletions(-) create mode 100644 bertserini/base.py create mode 100644 bertserini/readers.py diff --git a/bertserini/base.py b/bertserini/base.py new file mode 100644 index 0000000..d245d1d --- /dev/null +++ b/bertserini/base.py @@ -0,0 +1,147 @@ +from typing import List, Union, Optional, Mapping, Any +import abc + +from pyserini.search import JSimpleSearcherResult + + +__all__ = ['Question', 'Context', 'Reader', 'Answer', 'hits_to_contexts', 'TextType'] + + +TextType = Union['Question', 'Context', 'Answer'] + + +class Question: + """ + Class representing a question. 
+ A question contains the question text itself and potentially other metadata. + Parameters + ---------- + text : str + The question text. + id : Optional[str] + The question id. + """ + def __init__(self, text: str, id: Optional[str] = None, language: str = "en"): + self.text = text + self.id = id + self.language = language + + +class Context: + """ + Class representing a Context to find answer from. + A text is unspecified with respect to it length; in principle, it + could be a full-length document, a paragraph-sized passage, or + even a short phrase. + Parameters + ---------- + text : str + The context that contains potential answer. + metadata : Mapping[str, Any] + Additional metadata and other annotations. + score : Optional[float] + The score of the context. For example, the score might be the BM25 score + from an initial retrieval stage. + """ + + def __init__(self, + text: str, + language: str = "en", + metadata: Mapping[str, Any] = None, + score: Optional[float] = 0): + self.text = text + self.language = language + if metadata is None: + metadata = dict() + self.metadata = metadata + self.score = score + + +class Answer: + """ + Class representing an answer. + A answer contains the answer text itself and potentially other metadata. + Parameters + ---------- + text : str + The answer text. + metadata : Mapping[str, Any] + Additional metadata and other annotations. + score : Optional[float] + The score of the answer. + ctx_score : Optional[float] + The context score of the answer. 
+ total_score : Optional[float] + The aggregated score of answer score and ctx_score + """ + def __init__(self, + text: str, + language: str = "en", + metadata: Mapping[str, Any] = None, + score: Optional[float] = 0, + ctx_score: Optional[float] = 0, + total_score: Optional[float] = 0): + self.text = text + self.language = language + if metadata is None: + metadata = dict() + self.metadata = metadata + self.score = score + self.ctx_score = ctx_score + self.total_score = total_score + + def aggregate_score(self, weight): + self.total_score = weight*self.score + (1-weight)*self.ctx_score + + + +class Reader: + """ + Class representing a Reader. + A Reader takes a list Contexts and returns a list of Answer. + """ + @abc.abstractmethod + def predict(self, query: Question, texts: List[Context]) -> List[Answer]: + """ + Find answers from a list of Contexts with respect to a question. + Parameters + ---------- + query : Question + The question. + texts : List[Context] + The list of context. + Returns + ------- + List[Answer] + Predicted list of answer. + """ + pass + + +def hits_to_contexts(hits: List[JSimpleSearcherResult], field='raw', language="en", blacklist=[]) -> List[Context]: + """ + Converts hits from Pyserini into a list of texts. + Parameters + ---------- + hits : List[JSimpleSearcherResult] + The hits. + field : str + Field to use. + language : str + Language of corpus + blacklist : List[str] + strings that should not contained + Returns + ------- + List[Text] + List of texts. 
+ """ + contexts = [] + for i in range(0, len(hits)): + t = hits[i].raw if field == 'raw' else hits[i].contents + for s in blacklist: + if s in t: + continue + metadata = {'raw': hits[i].raw, 'docid': hits[i].docid} + contexts.append(Context(t, language, metadata, hits[i].score)) + return contexts diff --git a/bertserini/interactive.py b/bertserini/interactive.py index 948870b..2d8df4a 100644 --- a/bertserini/interactive.py +++ b/bertserini/interactive.py @@ -1,38 +1,21 @@ -from bertserini.bert_reader import BertReader -from bertserini.pyserini_retriever import retriever, build_searcher -from bertserini.utils import (choose_best_answer, weighted_score) - -from bertserini.args import * +from .base import Question +from .readers import BERT +from .pyserini_retriever import retriever, build_searcher if __name__ == "__main__": - bert_reader = BertReader(args) - ansrini_searcher = build_searcher(args.k1, args.b, args.index_path, args.rm3, chinese=args.chinese) + bert_reader = BERT("rsvp-ai/bertserini-bert-base-squad", "rsvp-ai/bertserini-bert-base-squad") + searcher = build_searcher("index/lucene-index.enwiki-20180701-paragraphs") while True: print("Please input your question[use empty line to exit]:") - question = input() - if len(question.strip()) == 0: - break - if args.chinese: - paragraphs = retriever(question.encode("utf-8"), ansrini_searcher, args.para_num) - else: - paragraphs = retriever(question, ansrini_searcher, args.para_num) - if len(paragraphs) == 0: - print("No related Wiki passage found") - paragraph_texts = [] - paragraph_scores = [] - for paragraph_id, paragraph in enumerate(paragraphs): - paragraph_texts.append(paragraph['text']) - paragraph_scores.append(paragraph['paragraph_score']) - #print(paragraph_texts[:3]) - - final_answers = bert_reader.predict(0, question, paragraph_texts, paragraph_scores) - mu = 0.45 - best_answer = choose_best_answer(final_answers, weighted_score, 1-mu, mu) - #print(final_answers) - #print(best_answer) - #{'id': 0, 
'answer': '1982', 'phrase_score': 14.186316013336182, 'paragraph_score': 9.100600242614746, 'total_score': 11.389172339439394} - print("Answer:{}\tTotal Score:{:.2f}\tParagraph Score:{:.2f}\tPhrase Score:{:.2f}".format( - best_answer["answer"], best_answer["total_score"], best_answer["paragraph_score"], best_answer["phrase_score"])) + question = Question(input()) + contexts = retriever(question, searcher, 10) + answers = bert_reader.predict(question, contexts) + for ans in answers: + ans.aggregate_score(0.45) + answers.sort(key=lambda x: x.total_score, reverse=True) + print(answers[0].text) + + diff --git a/bertserini/pyserini_retriever.py b/bertserini/pyserini_retriever.py index 8ef2e08..1d3fd77 100644 --- a/bertserini/pyserini_retriever.py +++ b/bertserini/pyserini_retriever.py @@ -1,15 +1,14 @@ from pyserini.search import SimpleSearcher -from bertserini.utils import init_logger +from .utils import init_logger +from .base import hits_to_contexts logger = init_logger("retriever") -def build_searcher(k1=0.9, b=0.4, index_path="index/lucene-index.wiki_paragraph_drqa.pos+docvectors", chinese=False): +def build_searcher(index_path, k1=0.9, b=0.4, language="en"): searcher = SimpleSearcher(index_path) searcher.set_bm25(k1, b) - if chinese: - searcher.object.setLanguage("zh") - print("########### we are usinig Chinese retriever ##########") + searcher.object.setLanguage(language=language) return searcher @@ -20,11 +19,12 @@ def retriever(question, searcher, para_num=20): logger.error("Search failure: {}, {}".format(question, e)) return [] + """ paragraphs = [] for hit in hits: doc_id = hit.docid score = hit.score - text = hit.contents + text = hit.raw if ("||" in text) or ("/><" in text) or \ ("|----|" in text) or ("#fffff" in text): @@ -34,5 +34,6 @@ def retriever(question, searcher, para_num=20): 'paragraph_score': score, 'docid': doc_id} paragraphs.append(paragraph_dict) + """ - return paragraphs + return hits_to_contexts(hits) diff --git a/bertserini/readers.py 
b/bertserini/readers.py new file mode 100644 index 0000000..ccdd415 --- /dev/null +++ b/bertserini/readers.py @@ -0,0 +1,112 @@ +from typing import List + +from transformers import AutoTokenizer, AutoModelForQuestionAnswering, squad_convert_examples_to_features +from torch.utils.data import DataLoader, SequentialSampler +import torch +from transformers.data.processors.squad import SquadResult + +from .base import Reader, Question, Context, Answer + +__all__ = ['BERT'] + +from .run_squad import to_list + +from .utils_squad import SquadExample, compute_predictions_logits + + +def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: + examples = [] + for idx, ctx in enumerate(contexts): + examples.append( + SquadExample( + qas_id=idx, + question_text=question.text, + context_text=ctx.text, + answer_text=None, + start_position_character=None, + title="", + is_impossible=False, + answers=[], + language=ctx.language + ) + ) + return examples + + +class BERT(Reader): + def __init__(self, model_name: str, tokenizer_name: str): + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(self.device).eval() + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True) + + def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: + examples = craft_squad_examples(question, contexts) + + features, dataset = squad_convert_examples_to_features( + examples=examples, + tokenizer=self.tokenizer, + max_seq_length=384, + doc_stride=128, + max_query_length=64, + is_training=False, + return_dataset="pt", + threads=1, + tqdm_enabled=False + ) + + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(dataset) + eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=32) + + all_results = [] + + for batch in eval_dataloader: + self.model.eval() + batch = 
tuple(t.to(self.device) for t in batch) + with torch.no_grad(): + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2], + } + feature_indices = batch[3] + outputs = self.model(**inputs) + + for i, feature_index in enumerate(feature_indices): + eval_feature = features[feature_index.item()] + unique_id = int(eval_feature.unique_id) + + output = [to_list(output[i]) for output in outputs] + + start_logits, end_logits = output + result = SquadResult(unique_id, start_logits, end_logits) + + all_results.append(result) + + answers, _ = compute_predictions_logits( + all_examples=examples, + all_features=features, + all_results=all_results, + n_best_size=20, + max_answer_length=30, + do_lower_case=True, + output_prediction_file=None, + output_nbest_file=None, + output_null_log_odds_file=None, + verbose_logging=False, + version_2_with_negative=False, + null_score_diff_threshold=0, + tokenizer=self.tokenizer, + language=question.language + ) + + all_answers = [] + for idx, ans in enumerate(answers): + all_results.append(Answer( + text=answers[ans][0], + score=answers[ans][1], + ctx_score=contexts[idx].score, + language=question.language + )) + return all_answers + diff --git a/bertserini/utils_squad.py b/bertserini/utils_squad.py index 10128b4..7c018ab 100644 --- a/bertserini/utils_squad.py +++ b/bertserini/utils_squad.py @@ -20,11 +20,13 @@ logger = logging.getLogger(__name__) + def _is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False + class SquadExample: """ A single training/test example for the Squad dataset, as loaded from disk. @@ -49,8 +51,7 @@ def __init__( title, answers=[], is_impossible=False, - chinese=False, - tokenizer=None, + language="en", ): self.qas_id = qas_id self.question_text = question_text @@ -67,13 +68,10 @@ def __init__( prev_is_whitespace = True # Split on whitespace so that different tokens may be attributed to their original position. 
- if chinese: - char_id = 0 - #paragraph = tokenizer.tokenize(self.context_text) + if language == "zh": for tok_id, c in enumerate(self.context_text): doc_tokens.append(c) char_to_word_offset.append(len(doc_tokens) - 1) - char_id += len(c) else: for c in self.context_text: if _is_whitespace(c): @@ -315,7 +313,7 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ return evaluation -def get_final_text(pred_text, orig_text, do_lower_case, tokenizer, chinese=False, verbose_logging=False): +def get_final_text(pred_text, orig_text, do_lower_case, language="zh", verbose_logging=False): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original @@ -359,7 +357,7 @@ def _strip_spaces(text): # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - if chinese: + if language=="zh": tok_text = "".join(tokenizer.tokenize(orig_text)) else: tok_text = " ".join(tokenizer.tokenize(orig_text)) @@ -460,7 +458,7 @@ def compute_predictions_logits( version_2_with_negative, null_score_diff_threshold, tokenizer, - chinese=False, + language="en", ): """Write final predictions to the json file and log-odds of null if needed.""" if output_prediction_file: @@ -574,14 +572,14 @@ def compute_predictions_logits( # Clean whitespace tok_text = tok_text.strip() - if chinese: + if language == "zh": tok_text = "".join(tok_text.split()) orig_text = "".join(orig_tokens) else: tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, chinese, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, language, verbose_logging) if "##" in final_text or "[UNK]" in final_text: print(final_text, "||", tok_text, "||", orig_text) if final_text in 
seen_predictions: @@ -662,198 +660,3 @@ def compute_predictions_logits( return all_predictions, all_nbest_json - -def compute_predictions_log_probs( - all_examples, - all_features, - all_results, - n_best_size, - max_answer_length, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - start_n_top, - end_n_top, - version_2_with_negative, - tokenizer, - verbose_logging, - chinese=False, -): - """ XLNet write prediction logic (more complex than Bert's). - Write final predictions to the json file and log-odds of null if needed. - - Requires utils_squad_evaluate.py - """ - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] - ) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] - ) - - logger.info("Writing predictions to: %s", output_prediction_file) - # logger.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - - cur_null_score = result.cls_logits - - # if we could have irrelevant answers, get the min score of irrelevant - score_null = min(score_null, cur_null_score) - - for i 
in range(start_n_top): - for j in range(end_n_top): - start_log_prob = result.start_logits[i] - start_index = result.start_top_index[i] - - j_index = i * end_n_top + j - - end_log_prob = result.end_logits[j_index] - end_index = result.end_top_index[j_index] - - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= feature.paragraph_len - 1: - continue - if end_index >= feature.paragraph_len - 1: - continue - - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_log_prob=start_log_prob, - end_log_prob=end_log_prob, - ) - ) - - prelim_predictions = sorted( - prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True - ) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - - # XLNet un-tokenizer - # Let's keep it simple for now and see if we need all this later. 
- # - # tok_start_to_orig_index = feature.tok_start_to_orig_index - # tok_end_to_orig_index = feature.tok_end_to_orig_index - # start_orig_pos = tok_start_to_orig_index[pred.start_index] - # end_orig_pos = tok_end_to_orig_index[pred.end_index] - # paragraph_text = example.paragraph_text - # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() - - # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] - tok_text = tokenizer.convert_tokens_to_string(tok_tokens) - - # Clean whitespace - tok_text = tok_text.strip() - if chinese: - tok_text = "".join(tok_text.split()) - orig_text = "".join(orig_tokens) - else: - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - if hasattr(tokenizer, "do_lower_case"): - do_lower_case = tokenizer.do_lower_case - else: - do_lower_case = tokenizer.do_lowercase_and_remove_accent - - final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, chinese, verbose_logging) - - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) - ) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_log_prob + entry.end_log_prob) - if not best_non_null_entry: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_log_prob"] = entry.start_log_prob - output["end_log_prob"] = entry.end_log_prob - nbest_json.append(output) - - assert len(nbest_json) >= 1, "No valid predictions" - assert best_non_null_entry is not None, "No valid predictions" - - score_diff = score_null - scores_diff_json[example.qas_id] = score_diff - # note(zhiliny): always predict best_non_null_entry - # and the evaluation script will search for the best threshold - all_predictions[example.qas_id] = (best_non_null_entry.text, - best_non_null_entry.start_logit + best_non_null_entry.end_logit) - - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions, all_nbest_json From f5215dd24d38458668f32986011f786e5188f2e7 Mon Sep 17 00:00:00 2001 From: xueguang Date: Sun, 20 Sep 2020 07:15:47 +0800 Subject: [PATCH 03/50] fix_bug --- bertserini/pyserini_retriever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertserini/pyserini_retriever.py b/bertserini/pyserini_retriever.py index 1d3fd77..55dff0c 100644 --- a/bertserini/pyserini_retriever.py +++ b/bertserini/pyserini_retriever.py @@ -8,7 +8,7 @@ def 
build_searcher(index_path, k1=0.9, b=0.4, language="en"): searcher = SimpleSearcher(index_path) searcher.set_bm25(k1, b) - searcher.object.setLanguage(language=language) + searcher.object.setLanguage(language) return searcher From 4222fb19fdb493ccd530ba37396efac9620c122c Mon Sep 17 00:00:00 2001 From: xueguang Date: Sun, 20 Sep 2020 07:18:15 +0800 Subject: [PATCH 04/50] search Question.text --- bertserini/interactive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertserini/interactive.py b/bertserini/interactive.py index 2d8df4a..577158f 100644 --- a/bertserini/interactive.py +++ b/bertserini/interactive.py @@ -10,7 +10,7 @@ while True: print("Please input your question[use empty line to exit]:") question = Question(input()) - contexts = retriever(question, searcher, 10) + contexts = retriever(question.text, searcher, 10) answers = bert_reader.predict(question, contexts) for ans in answers: ans.aggregate_score(0.45) From 09a4e2c2b3ca73c2c3097b3c1768f047ec344ac4 Mon Sep 17 00:00:00 2001 From: xueguang Date: Sun, 20 Sep 2020 07:24:30 +0800 Subject: [PATCH 05/50] fix bug in bert reader --- bertserini/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertserini/readers.py b/bertserini/readers.py index ccdd415..881128d 100644 --- a/bertserini/readers.py +++ b/bertserini/readers.py @@ -102,7 +102,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: all_answers = [] for idx, ans in enumerate(answers): - all_results.append(Answer( + all_answers.append(Answer( text=answers[ans][0], score=answers[ans][1], ctx_score=contexts[idx].score, From 1ea494ae7826b38cb5c487ebc3eca65ef2815e5d Mon Sep 17 00:00:00 2001 From: xueguang Date: Sun, 20 Sep 2020 09:42:09 +0800 Subject: [PATCH 06/50] test chinese --- bertserini/base.py | 2 +- bertserini/interactive.py | 8 ++++---- bertserini/pyserini_retriever.py | 28 +++++++--------------------- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git 
a/bertserini/base.py b/bertserini/base.py index d245d1d..5fb03a5 100644 --- a/bertserini/base.py +++ b/bertserini/base.py @@ -118,7 +118,7 @@ def predict(self, query: Question, texts: List[Context]) -> List[Answer]: pass -def hits_to_contexts(hits: List[JSimpleSearcherResult], field='raw', language="en", blacklist=[]) -> List[Context]: +def hits_to_contexts(hits: List[JSimpleSearcherResult], language="en", field='raw', blacklist=[]) -> List[Context]: """ Converts hits from Pyserini into a list of texts. Parameters diff --git a/bertserini/interactive.py b/bertserini/interactive.py index 577158f..6589e75 100644 --- a/bertserini/interactive.py +++ b/bertserini/interactive.py @@ -4,13 +4,13 @@ if __name__ == "__main__": - bert_reader = BERT("rsvp-ai/bertserini-bert-base-squad", "rsvp-ai/bertserini-bert-base-squad") - searcher = build_searcher("index/lucene-index.enwiki-20180701-paragraphs") + bert_reader = BERT("rsvp-ai/bertserini-bert-base-cmrc", "rsvp-ai/bertserini-bert-base-cmrc") + searcher = build_searcher("index/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors") while True: print("Please input your question[use empty line to exit]:") - question = Question(input()) - contexts = retriever(question.text, searcher, 10) + question = Question(input(), "zh") + contexts = retriever(question, searcher, 10) answers = bert_reader.predict(question, contexts) for ans in answers: ans.aggregate_score(0.45) diff --git a/bertserini/pyserini_retriever.py b/bertserini/pyserini_retriever.py index 55dff0c..b800c08 100644 --- a/bertserini/pyserini_retriever.py +++ b/bertserini/pyserini_retriever.py @@ -13,27 +13,13 @@ def build_searcher(index_path, k1=0.9, b=0.4, language="en"): def retriever(question, searcher, para_num=20): + language = question.language try: - hits = searcher.search(question, k=para_num) + if language == "zh": + hits = searcher.search(question.text.encode("utf-8"), k=para_num) + else: + hits = searcher.search(question.text, k=para_num) except 
ValueError as e: - logger.error("Search failure: {}, {}".format(question, e)) + logger.error("Search failure: {}, {}".format(question.text, e)) return [] - - """ - paragraphs = [] - for hit in hits: - doc_id = hit.docid - score = hit.score - text = hit.raw - - if ("||" in text) or ("/><" in text) or \ - ("|----|" in text) or ("#fffff" in text): - continue - else: - paragraph_dict = {'text': text, - 'paragraph_score': score, - 'docid': doc_id} - paragraphs.append(paragraph_dict) - """ - - return hits_to_contexts(hits) + return hits_to_contexts(hits, language) From 0c7ef5e4b00d833b4433a989c315de56271b2117 Mon Sep 17 00:00:00 2001 From: xueguang Date: Sun, 20 Sep 2020 11:15:03 +0800 Subject: [PATCH 07/50] create utils_new --- bertserini/base.py | 35 +------------------------------- bertserini/interactive.py | 9 ++++---- bertserini/pyserini_retriever.py | 35 ++++++++++++++++++++++++++++++-- bertserini/utils_new.py | 4 ++++ 4 files changed, 42 insertions(+), 41 deletions(-) create mode 100644 bertserini/utils_new.py diff --git a/bertserini/base.py b/bertserini/base.py index 5fb03a5..75f080e 100644 --- a/bertserini/base.py +++ b/bertserini/base.py @@ -1,10 +1,7 @@ from typing import List, Union, Optional, Mapping, Any import abc -from pyserini.search import JSimpleSearcherResult - - -__all__ = ['Question', 'Context', 'Reader', 'Answer', 'hits_to_contexts', 'TextType'] +__all__ = ['Question', 'Context', 'Reader', 'Answer', 'TextType'] TextType = Union['Question', 'Context', 'Answer'] @@ -94,7 +91,6 @@ def aggregate_score(self, weight): self.total_score = weight*self.score + (1-weight)*self.ctx_score - class Reader: """ Class representing a Reader. @@ -116,32 +112,3 @@ def predict(self, query: Question, texts: List[Context]) -> List[Answer]: Predicted list of answer. """ pass - - -def hits_to_contexts(hits: List[JSimpleSearcherResult], language="en", field='raw', blacklist=[]) -> List[Context]: - """ - Converts hits from Pyserini into a list of texts. 
- Parameters - ---------- - hits : List[JSimpleSearcherResult] - The hits. - field : str - Field to use. - language : str - Language of corpus - blacklist : List[str] - strings that should not contained - Returns - ------- - List[Text] - List of texts. - """ - contexts = [] - for i in range(0, len(hits)): - t = hits[i].raw if field == 'raw' else hits[i].contents - for s in blacklist: - if s in t: - continue - metadata = {'raw': hits[i].raw, 'docid': hits[i].docid} - contexts.append(Context(t, language, metadata, hits[i].score)) - return contexts diff --git a/bertserini/interactive.py b/bertserini/interactive.py index 6589e75..82a646a 100644 --- a/bertserini/interactive.py +++ b/bertserini/interactive.py @@ -1,6 +1,7 @@ from .base import Question from .readers import BERT from .pyserini_retriever import retriever, build_searcher +from .utils_new import get_best_answer if __name__ == "__main__": @@ -11,11 +12,9 @@ print("Please input your question[use empty line to exit]:") question = Question(input(), "zh") contexts = retriever(question, searcher, 10) - answers = bert_reader.predict(question, contexts) - for ans in answers: - ans.aggregate_score(0.45) - answers.sort(key=lambda x: x.total_score, reverse=True) - print(answers[0].text) + candidates = bert_reader.predict(question, contexts) + answer = get_best_answer(candidates, 0.45) + print(answer.text) diff --git a/bertserini/pyserini_retriever.py b/bertserini/pyserini_retriever.py index b800c08..1c3e2f8 100644 --- a/bertserini/pyserini_retriever.py +++ b/bertserini/pyserini_retriever.py @@ -1,6 +1,8 @@ -from pyserini.search import SimpleSearcher +from typing import List + +from pyserini.search import SimpleSearcher, JSimpleSearcherResult from .utils import init_logger -from .base import hits_to_contexts +from .base import Context logger = init_logger("retriever") @@ -23,3 +25,32 @@ def retriever(question, searcher, para_num=20): logger.error("Search failure: {}, {}".format(question.text, e)) return [] return 
hits_to_contexts(hits, language) + + +def hits_to_contexts(hits: List[JSimpleSearcherResult], language="en", field='raw', blacklist=[]) -> List[Context]: + """ + Converts hits from Pyserini into a list of texts. + Parameters + ---------- + hits : List[JSimpleSearcherResult] + The hits. + field : str + Field to use. + language : str + Language of corpus + blacklist : List[str] + strings that should not contained + Returns + ------- + List[Text] + List of texts. + """ + contexts = [] + for i in range(0, len(hits)): + t = hits[i].raw if field == 'raw' else hits[i].contents + for s in blacklist: + if s in t: + continue + metadata = {'raw': hits[i].raw, 'docid': hits[i].docid} + contexts.append(Context(t, language, metadata, hits[i].score)) + return contexts diff --git a/bertserini/utils_new.py b/bertserini/utils_new.py new file mode 100644 index 0000000..1cd54dc --- /dev/null +++ b/bertserini/utils_new.py @@ -0,0 +1,4 @@ +def get_best_answer(candidates, weight=0.5): + for ans in candidates: + ans.aggregate_score(weight) + return candidates.sorted(key=lambda x: x.total_score, reverse=True)[0] From f6b11fb540d774d5c46cd5a8b91bf31717b4b975 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 13:44:02 +0800 Subject: [PATCH 08/50] add experiment_squad.py --- bertserini/experiment_squad.py | 27 +++++++++++++++++++++++++++ bertserini/utils_new.py | 18 ++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 bertserini/experiment_squad.py diff --git a/bertserini/experiment_squad.py b/bertserini/experiment_squad.py new file mode 100644 index 0000000..d882d3a --- /dev/null +++ b/bertserini/experiment_squad.py @@ -0,0 +1,27 @@ +import json +from tqdm import tqdm +from .readers import BERT +from .pyserini_retriever import retriever, build_searcher +from .utils_new import extract_squad_questions + +if __name__ == "__main__": + + questions = extract_squad_questions("squad_dev.json") + bert_reader = BERT("rsvp-ai/bertserini-bert-base-squad", 
"rsvp-ai/bertserini-bert-base-squad") + searcher = build_searcher("index/lucene-index.enwiki-20180701-paragraphs") + + all_answer = [] + for question in tqdm(questions): + contexts = retriever(question, searcher, 10) + final_answers = bert_reader.predict(question, contexts) + final_answers_lst = [] + for ans in final_answers: + final_answers_lst.append({ + {"id": question.id, + "answer": ans.text, + "phrase_score": ans.score, + "paragraph_score": ans.ctx_score, + } + }) + all_answer.append(final_answers_lst) + json.dump(all_answer, open("result_bert_base.json", 'w')) diff --git a/bertserini/utils_new.py b/bertserini/utils_new.py index 1cd54dc..307b8ae 100644 --- a/bertserini/utils_new.py +++ b/bertserini/utils_new.py @@ -1,4 +1,22 @@ +import json +from .base import Question, Answer + + def get_best_answer(candidates, weight=0.5): for ans in candidates: ans.aggregate_score(weight) return candidates.sorted(key=lambda x: x.total_score, reverse=True)[0] + + +def extract_squad_questions(squad_filename): + data = json.load(open(squad_filename, 'r')) + data = data["data"] + questions = [] + for article in data: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + id_ = qa["id"] + question = qa["question"] + questions.append(Question(question, id_)) + return questions + From d71ade8e60e0ac6013258f3662481dcb74447a8c Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 15:57:11 +0800 Subject: [PATCH 09/50] fix bug of experiment_squad --- bertserini/experiment_squad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bertserini/experiment_squad.py b/bertserini/experiment_squad.py index d882d3a..535c005 100644 --- a/bertserini/experiment_squad.py +++ b/bertserini/experiment_squad.py @@ -6,7 +6,7 @@ if __name__ == "__main__": - questions = extract_squad_questions("squad_dev.json") + questions = extract_squad_questions("data/dev-v1.1.json") bert_reader = BERT("rsvp-ai/bertserini-bert-base-squad", 
"rsvp-ai/bertserini-bert-base-squad") searcher = build_searcher("index/lucene-index.enwiki-20180701-paragraphs") @@ -16,12 +16,12 @@ final_answers = bert_reader.predict(question, contexts) final_answers_lst = [] for ans in final_answers: - final_answers_lst.append({ + final_answers_lst.append( {"id": question.id, "answer": ans.text, "phrase_score": ans.score, "paragraph_score": ans.ctx_score, } - }) + ) all_answer.append(final_answers_lst) json.dump(all_answer, open("result_bert_base.json", 'w')) From 2e36092da5706bc2dee859dcf4d49ff08defe338 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 16:31:43 +0800 Subject: [PATCH 10/50] add experiment_cmrc --- bertserini/aggregate.py | 10 +- bertserini/experiment_cmrc.py | 27 +++ bertserini/experiment_squad.py | 3 +- bertserini/utils_new.py | 4 +- bertserini/utils_squad_evaluate.py | 358 ----------------------------- requirements.txt | 1 + 6 files changed, 34 insertions(+), 369 deletions(-) create mode 100644 bertserini/experiment_cmrc.py delete mode 100644 bertserini/utils_squad_evaluate.py diff --git a/bertserini/aggregate.py b/bertserini/aggregate.py index 103e65d..fe4e294 100755 --- a/bertserini/aggregate.py +++ b/bertserini/aggregate.py @@ -1,20 +1,13 @@ import json import argparse -import os, time, random - -# import matplotlib.pyplot as plt -# from matplotlib.ticker import MultipleLocator -from tqdm import trange -from tqdm import tqdm import numpy as np from bertserini.utils import choose_best_answer, weighted_score, normalize_answer, normalize_text, get_type from bertserini.eval.evaluate_v1 import squad_v1_eval as squad_evaluation -#from eval.trivia_eval import evaluation as trivia_evaluation from bertserini.eval.evaluate_v1_drcd import evaluation as drcd_evaluation from bertserini.eval.evaluate_v1_cmrc import evaluate as cmrc_evaluation -#from eval.evaluate_v1_special import evaluation as special_evaluation + def get_score_with_results(eval_data, predictions, mu, dataset): answers = {} @@ -65,6 +58,7 
@@ def get_score_with_results(eval_data, predictions, mu, dataset): print("mu:{}, result:{}".format(mu, eval_result)) return eval_result, answers + def get_best_mu_with_scores(eval_data, predictions, mu_range, dataset, output_path): score_test = {} best_mu = 0 diff --git a/bertserini/experiment_cmrc.py b/bertserini/experiment_cmrc.py new file mode 100644 index 0000000..8a84e49 --- /dev/null +++ b/bertserini/experiment_cmrc.py @@ -0,0 +1,27 @@ +import json +from tqdm import tqdm +from .readers import BERT +from .pyserini_retriever import retriever, build_searcher +from .utils_new import extract_squad_questions + +if __name__ == "__main__": + + questions = extract_squad_questions("data/cmrc_dev_squad.json") + bert_reader = BERT("rsvp-ai/bertserini-bert-base-cmrc", "rsvp-ai/bertserini-bert-base-cmrc") + searcher = build_searcher("index/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors", language="zh") + + all_answer = [] + for question in tqdm(questions): + contexts = retriever(question, searcher, 10) + final_answers = bert_reader.predict(question, contexts) + final_answers_lst = [] + for ans in final_answers: + final_answers_lst.append( + {"id": question.id, + "answer": ans.text, + "phrase_score": ans.score, + "paragraph_score": ans.ctx_score, + } + ) + all_answer.append(final_answers_lst) + json.dump(all_answer, open("result_cmrc.json", 'w'), indent=4) diff --git a/bertserini/experiment_squad.py b/bertserini/experiment_squad.py index 535c005..a3289a0 100644 --- a/bertserini/experiment_squad.py +++ b/bertserini/experiment_squad.py @@ -24,4 +24,5 @@ } ) all_answer.append(final_answers_lst) - json.dump(all_answer, open("result_bert_base.json", 'w')) + json.dump(all_answer, open("result_bert_base.json", 'w'), indent=4) + diff --git a/bertserini/utils_new.py b/bertserini/utils_new.py index 307b8ae..086a03f 100644 --- a/bertserini/utils_new.py +++ b/bertserini/utils_new.py @@ -8,7 +8,7 @@ def get_best_answer(candidates, weight=0.5): return 
candidates.sorted(key=lambda x: x.total_score, reverse=True)[0] -def extract_squad_questions(squad_filename): +def extract_squad_questions(squad_filename, language="en"): data = json.load(open(squad_filename, 'r')) data = data["data"] questions = [] @@ -17,6 +17,6 @@ def extract_squad_questions(squad_filename): for qa in paragraph["qas"]: id_ = qa["id"] question = qa["question"] - questions.append(Question(question, id_)) + questions.append(Question(question, id_, language)) return questions diff --git a/bertserini/utils_squad_evaluate.py b/bertserini/utils_squad_evaluate.py deleted file mode 100644 index cc6c497..0000000 --- a/bertserini/utils_squad_evaluate.py +++ /dev/null @@ -1,358 +0,0 @@ -""" Official evaluation script for SQuAD version 2.0. - Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 - -In addition to basic functionality, we also compute additional statistics and -plot precision-recall curves if an additional na_prob.json file is provided. -This file is expected to map question ID's to the model's predicted probability -that a question is unanswerable. 
-""" -import argparse -import collections -import json -import numpy as np -import os -import re -import string -import sys - -class EVAL_OPTS(): - def __init__(self, data_file, pred_file, out_file="", - na_prob_file="na_prob.json", na_prob_thresh=1.0, - out_image_dir=None, verbose=False, cmrc=False): - self.data_file = data_file - self.pred_file = pred_file - self.out_file = out_file - self.na_prob_file = na_prob_file - self.na_prob_thresh = na_prob_thresh - self.out_image_dir = out_image_dir - self.verbose = verbose - self.cmrc = cmrc - -OPTS = None - -def parse_args(): - parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') - parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') - parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') - parser.add_argument('--out-file', '-o', metavar='eval.json', - help='Write accuracy metrics to file (default is stdout).') - parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', - help='Model estimates of probability of no answer.') - parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, - help='Predict "" if no-answer probability exceeds this (default = 1.0).') - parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, - help='Save precision-recall curves to directory.') - parser.add_argument('--verbose', '-v', action='store_true') - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - return parser.parse_args() - -def make_qid_to_has_ans(dataset, cmrc=False): - qid_to_has_ans = {} - if not cmrc: - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid_to_has_ans[qa['id']] = bool(qa['answers']) - else: - for article in dataset: - for qas in article['qas']: - qid_to_has_ans[qas['query_id']] = bool(qas['answers']) - return qid_to_has_ans - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - def 
remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) - def white_space_fix(text): - return ' '.join(text.split()) - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - def lower(text): - return text.lower() - s = str(s) - return white_space_fix(remove_articles(remove_punc(lower(s)))) - -def get_tokens(s): - if not s: return [] - return normalize_answer(s).split() - -def compute_exact(a_gold, a_pred): - return int(normalize_answer(a_gold) == normalize_answer(a_pred)) - -def compute_f1(a_gold, a_pred): - gold_toks = get_tokens(a_gold) - pred_toks = get_tokens(a_pred) - common = collections.Counter(gold_toks) & collections.Counter(pred_toks) - num_same = sum(common.values()) - if len(gold_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - -def get_raw_scores(dataset, preds, cmrc=False): - exact_scores = {} - f1_scores = {} - if not cmrc: - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid = qa['id'] - gold_answers = [a['text'] for a in qa['answers'] - if normalize_answer(a['text'])] - if not gold_answers: - # For unanswerable questions, only correct answer is empty string - gold_answers = [''] - if qid not in preds: - print('Missing prediction for %s' % qid) - continue - a_pred = preds[qid] - # Take max over all gold answers - exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) - f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) - else: - for article in dataset: - for qa in article['qas']: - qid = qa['query_id'] - gold_answers = [a for a in qa['answers'] - if normalize_answer(a)] - if not gold_answers: - 
# For unanswerable questions, only correct answer is empty string - gold_answers = [''] - if qid not in preds: - print('Missing prediction for %s' % qid) - continue - a_pred = preds[qid] - # Take max over all gold answers - exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) - f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) - - return exact_scores, f1_scores - -def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): - new_scores = {} - for qid, s in scores.items(): - pred_na = na_probs[qid] > na_prob_thresh - if pred_na: - new_scores[qid] = float(not qid_to_has_ans[qid]) - else: - new_scores[qid] = s - return new_scores - -def make_eval_dict(exact_scores, f1_scores, qid_list=None): - if not qid_list: - total = len(exact_scores) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores.values()) / total), - ('f1', 100.0 * sum(f1_scores.values()) / total), - ('total', total), - ]) - else: - total = len(qid_list) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), - ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), - ('total', total), - ]) - -def merge_eval(main_eval, new_eval, prefix): - for k in new_eval: - main_eval['%s_%s' % (prefix, k)] = new_eval[k] - -def plot_pr_curve(precisions, recalls, out_image, title): - plt.step(recalls, precisions, color='b', alpha=0.2, where='post') - plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.xlim([0.0, 1.05]) - plt.ylim([0.0, 1.05]) - plt.title(title) - plt.savefig(out_image) - plt.clf() - -def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, - out_image=None, title=None): - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - true_pos = 0.0 - cur_p = 1.0 - cur_r = 0.0 - precisions = [1.0] - recalls = [0.0] - avg_prec = 0.0 - for i, qid in enumerate(qid_list): - if qid_to_has_ans[qid]: 
- true_pos += scores[qid] - cur_p = true_pos / float(i+1) - cur_r = true_pos / float(num_true_pos) - if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: - # i.e., if we can put a threshold after this point - avg_prec += cur_p * (cur_r - recalls[-1]) - precisions.append(cur_p) - recalls.append(cur_r) - if out_image: - plot_pr_curve(precisions, recalls, out_image, title) - return {'ap': 100.0 * avg_prec} - -def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, - qid_to_has_ans, out_image_dir): - if out_image_dir and not os.path.exists(out_image_dir): - os.makedirs(out_image_dir) - num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) - if num_true_pos == 0: - return - pr_exact = make_precision_recall_eval( - exact_raw, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_exact.png'), - title='Precision-Recall curve for Exact Match score') - pr_f1 = make_precision_recall_eval( - f1_raw, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_f1.png'), - title='Precision-Recall curve for F1 score') - oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} - pr_oracle = make_precision_recall_eval( - oracle_scores, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_oracle.png'), - title='Oracle Precision-Recall curve (binary task of HasAns vs. 
NoAns)') - merge_eval(main_eval, pr_exact, 'pr_exact') - merge_eval(main_eval, pr_f1, 'pr_f1') - merge_eval(main_eval, pr_oracle, 'pr_oracle') - -def histogram_na_prob(na_probs, qid_list, image_dir, name): - if not qid_list: - return - x = [na_probs[k] for k in qid_list] - weights = np.ones_like(x) / float(len(x)) - plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) - plt.xlabel('Model probability of no-answer') - plt.ylabel('Proportion of dataset') - plt.title('Histogram of no-answer probability: %s' % name) - plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) - plt.clf() - -def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): - if qid not in scores: continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - return 100.0 * best_score / len(scores), best_thresh - -def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): - if qid not in scores: continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - - has_ans_score, has_ans_cnt = 0, 0 - for qid in qid_list: - if not qid_to_has_ans[qid]: continue - has_ans_cnt += 1 - - if qid not in scores: continue - has_ans_score += scores[qid] - - return 100.0 * best_score / len(scores), best_thresh, 1.0 * 
has_ans_score / has_ans_cnt - -def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - -def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - main_eval['has_ans_exact'] = has_ans_exact - main_eval['has_ans_f1'] = has_ans_f1 - -def main(OPTS): - with open(OPTS.data_file) as f: - dataset_json = json.load(f) - if OPTS.cmrc: - dataset = dataset_json - else: - dataset = dataset_json['data'] - with open(OPTS.pred_file) as f: - preds = json.load(f) - if OPTS.na_prob_file: - with open(OPTS.na_prob_file) as f: - na_probs = json.load(f) - else: - na_probs = {k: 0.0 for k in preds} - qid_to_has_ans = make_qid_to_has_ans(dataset, OPTS.cmrc) # maps qid to True/False - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = get_raw_scores(dataset, preds, OPTS.cmrc) - exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, - OPTS.na_prob_thresh) - f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, - OPTS.na_prob_thresh) - out_eval = make_eval_dict(exact_thresh, f1_thresh) - if has_ans_qids: - has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) - merge_eval(out_eval, 
has_ans_eval, 'HasAns') - if no_ans_qids: - no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) - merge_eval(out_eval, no_ans_eval, 'NoAns') - if OPTS.na_prob_file: - find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) - if OPTS.na_prob_file and OPTS.out_image_dir: - run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, - qid_to_has_ans, OPTS.out_image_dir) - histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns') - histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns') - if OPTS.out_file: - with open(OPTS.out_file, 'w') as f: - json.dump(out_eval, f) - else: - print(json.dumps(out_eval, indent=2)) - return out_eval - -if __name__ == '__main__': - OPTS = parse_args() - if OPTS.out_image_dir: - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - main(OPTS) diff --git a/requirements.txt b/requirements.txt index 3ffca18..8f946e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +tqdm numpy pyserini transformers From a521329faf6b77b59e42e092c4f2e3dde20f46c0 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 17:00:28 +0800 Subject: [PATCH 11/50] change repo structure --- bertserini/bert_reader.py | 249 ------------------ bertserini/{eval => experiments}/__init__.py | 0 bertserini/experiments/eval/__init__.py | 0 .../{ => experiments}/eval/evaluate_v1.py | 3 +- .../eval/evaluate_v1_cmrc.py | 0 .../eval/evaluate_v1_drcd.py | 0 .../{aggregate.py => experiments/evaluate.py} | 12 +- .../{ => experiments}/experiment_cmrc.py | 6 +- .../{ => experiments}/experiment_squad.py | 6 +- bertserini/interactive.py | 8 +- bertserini/reader/__init__.py | 0 bertserini/{ => reader}/base.py | 0 .../{readers.py => reader/bert_reader.py} | 6 +- bertserini/retriever/__init__.py | 0 .../{ => retriever}/pyserini_retriever.py | 4 +- bertserini/search.py | 70 ----- bertserini/utils/__init__.py | 0 bertserini/{ => utils}/utils.py | 0 
bertserini/{ => utils}/utils_new.py | 2 +- bertserini/{ => utils}/utils_squad.py | 0 20 files changed, 21 insertions(+), 345 deletions(-) delete mode 100644 bertserini/bert_reader.py rename bertserini/{eval => experiments}/__init__.py (100%) create mode 100644 bertserini/experiments/eval/__init__.py rename bertserini/{ => experiments}/eval/evaluate_v1.py (98%) rename bertserini/{ => experiments}/eval/evaluate_v1_cmrc.py (100%) rename bertserini/{ => experiments}/eval/evaluate_v1_drcd.py (100%) rename bertserini/{aggregate.py => experiments/evaluate.py} (86%) rename bertserini/{ => experiments}/experiment_cmrc.py (83%) rename bertserini/{ => experiments}/experiment_squad.py (82%) create mode 100644 bertserini/reader/__init__.py rename bertserini/{ => reader}/base.py (100%) rename bertserini/{readers.py => reader/bert_reader.py} (95%) create mode 100644 bertserini/retriever/__init__.py rename bertserini/{ => retriever}/pyserini_retriever.py (94%) delete mode 100644 bertserini/search.py create mode 100644 bertserini/utils/__init__.py rename bertserini/{ => utils}/utils.py (100%) rename bertserini/{ => utils}/utils_new.py (93%) rename bertserini/{ => utils}/utils_squad.py (100%) diff --git a/bertserini/bert_reader.py b/bertserini/bert_reader.py deleted file mode 100644 index 46e1349..0000000 --- a/bertserini/bert_reader.py +++ /dev/null @@ -1,249 +0,0 @@ -import os -import logging -import torch -from torch.utils.data import DataLoader, SequentialSampler -from transformers.data.processors.squad import SquadResult -from bertserini.run_squad import to_list -from bertserini.utils_squad import compute_predictions_log_probs, compute_predictions_logits, SquadExample -from transformers import ( - AutoModelForQuestionAnswering, - AutoTokenizer, - squad_convert_examples_to_features, -) - - -logger = logging.getLogger(__name__) - - -class MySquadExample(SquadExample): - def __init__(self, - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - 
title, - answers=[], - is_impossible=False, - paragraph_score=0, - chinese=False, - tokenizer=None): - super(MySquadExample, self).__init__( - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - title, - answers, - is_impossible, - chinese, - tokenizer, - ) - self.paragraph_score = paragraph_score - - -def create_inference_examples(query, paragraphs, paragraph_scores, chinese=False, tokenizer=None): - examples = [] - for (id, paragraph) in enumerate(paragraphs): - example = MySquadExample( - qas_id=id, - question_text=query, - context_text=paragraph, - answer_text=None, - start_position_character=None, - title="", - is_impossible=False, - answers=[], - paragraph_score=paragraph_scores[id], - chinese=chinese, - tokenizer=tokenizer, - ) - id += 1 - examples.append(example) - - return examples - - -class BertReader: - def __init__(self, args): - super(BertReader, self).__init__() - self.args = args - self.tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None, - ) - checkpoint = self.args.model_name_or_path - - logger.info("Evaluate the following checkpoints: %s", checkpoint) - - # Reload the model - global_step = "" - self.model = AutoModelForQuestionAnswering.from_pretrained(checkpoint) # , force_download=True) - self.model = self.model.to(args.device) - - self.model.eval() - - def predict(self, id_, question, paragraph_texts, paragraph_scores): - # dataset, examples, features = load_and_cache_examples(self.args, self.tokenizer, evaluate=True, output_examples=True) - - # processor = SquadV2Processor() if self.args.version_2_with_negative else SquadV1Processor() - # todo convert to single query examples - examples = create_inference_examples( - question, - paragraph_texts, - paragraph_scores, - chinese=self.args.chinese, - tokenizer=self.tokenizer) - - features, dataset = 
squad_convert_examples_to_features( - examples=examples, - tokenizer=self.tokenizer, - max_seq_length=self.args.max_seq_length, - doc_stride=self.args.doc_stride, - max_query_length=self.args.max_query_length, - is_training=False, - return_dataset="pt", - threads=self.args.threads, - tqdm_enabled=False - ) - - # if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - # os.makedirs(args.output_dir) - - self.args.eval_batch_size = self.args.per_gpu_eval_batch_size * max(1, self.args.n_gpu) - - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size) - - # multi-gpu evaluate - if self.args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): - self.model = torch.nn.DataParallel(self.model) - - # Eval! - # logger.info("***** Running evaluation {} *****".format(prefix)) - # logger.info(" Num examples = %d", len(dataset)) - # logger.info(" Batch size = %d", args.eval_batch_size) - - all_results = [] - # start_time = timeit.default_timer() - - for batch in eval_dataloader: - self.model.eval() - batch = tuple(t.to(self.args.device) for t in batch) - - with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "token_type_ids": batch[2], - } - - # if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]: - # del inputs["token_type_ids"] - - feature_indices = batch[3] - - # XLNet and XLM use more arguments for their predictions - # if args.model_type in ["xlnet", "xlm"]: - # inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) - # # for lang_id-sensitive xlm models - # if hasattr(model, "config") and hasattr(model.config, "lang2id"): - # inputs.update( - # {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} - # ) - - outputs = self.model(**inputs) - - for i, feature_index in enumerate(feature_indices): - eval_feature = 
features[feature_index.item()] - unique_id = int(eval_feature.unique_id) - - output = [to_list(output[i]) for output in outputs] - - # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" - # models only use two. - if len(output) >= 5: - start_logits = output[0] - start_top_index = output[1] - end_logits = output[2] - end_top_index = output[3] - cls_logits = output[4] - - result = SquadResult( - unique_id, - start_logits, - end_logits, - start_top_index=start_top_index, - end_top_index=end_top_index, - cls_logits=cls_logits, - ) - - else: - start_logits, end_logits = output - result = SquadResult(unique_id, start_logits, end_logits) - - all_results.append(result) - - # Compute predictions - prefix = "" - output_prediction_file = os.path.join(self.args.output_dir, "predictions_{}.json".format(prefix)) - output_nbest_file = os.path.join(self.args.output_dir, "nbest_predictions_{}.json".format(prefix)) - - if self.args.version_2_with_negative: - output_null_log_odds_file = os.path.join(self.args.output_dir, "null_odds_{}.json".format(prefix)) - else: - output_null_log_odds_file = None - - # XLNet and XLM use a more complex post-processing procedure - if self.args.model_type in ["xlnet", "xlm"]: - start_n_top = self.model.config.start_n_top if hasattr(self.model, - "config") else self.model.module.config.start_n_top - end_n_top = self.model.config.end_n_top if hasattr(self.model, - "config") else self.model.module.config.end_n_top - - answers, nbest_answers = compute_predictions_log_probs( - examples, - features, - all_results, - self.args.n_best_size, - self.args.max_answer_length, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - start_n_top, - end_n_top, - self.args.version_2_with_negative, - self.tokenizer, - self.args.verbose_logging, - self.args.chinese - ) - else: - answers, nbest_answers = compute_predictions_logits( - examples, - features, - all_results, - self.args.n_best_size, - 
self.args.max_answer_length, - self.args.do_lower_case, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - self.args.verbose_logging, - self.args.version_2_with_negative, - self.args.null_score_diff_threshold, - self.tokenizer, - self.args.chinese - ) - - all_answers = [] - for answer_id, ans in enumerate(answers): - ans_dict = {"id": id_, - "answer": answers[ans][0], - "phrase_score": answers[ans][1], - "paragraph_score": paragraph_scores[answer_id], - } - all_answers.append(ans_dict) - return all_answers diff --git a/bertserini/eval/__init__.py b/bertserini/experiments/__init__.py similarity index 100% rename from bertserini/eval/__init__.py rename to bertserini/experiments/__init__.py diff --git a/bertserini/experiments/eval/__init__.py b/bertserini/experiments/eval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bertserini/eval/evaluate_v1.py b/bertserini/experiments/eval/evaluate_v1.py similarity index 98% rename from bertserini/eval/evaluate_v1.py rename to bertserini/experiments/eval/evaluate_v1.py index 3eb73f2..b853597 100755 --- a/bertserini/eval/evaluate_v1.py +++ b/bertserini/experiments/eval/evaluate_v1.py @@ -3,9 +3,8 @@ from collections import Counter import argparse import json -import sys -from bertserini.utils import normalize_answer, init_logger +from bertserini.utils.utils import normalize_answer, init_logger logger = init_logger("evluation") diff --git a/bertserini/eval/evaluate_v1_cmrc.py b/bertserini/experiments/eval/evaluate_v1_cmrc.py similarity index 100% rename from bertserini/eval/evaluate_v1_cmrc.py rename to bertserini/experiments/eval/evaluate_v1_cmrc.py diff --git a/bertserini/eval/evaluate_v1_drcd.py b/bertserini/experiments/eval/evaluate_v1_drcd.py similarity index 100% rename from bertserini/eval/evaluate_v1_drcd.py rename to bertserini/experiments/eval/evaluate_v1_drcd.py diff --git a/bertserini/aggregate.py b/bertserini/experiments/evaluate.py similarity index 86% rename from 
bertserini/aggregate.py rename to bertserini/experiments/evaluate.py index fe4e294..9021302 100755 --- a/bertserini/aggregate.py +++ b/bertserini/experiments/evaluate.py @@ -2,11 +2,11 @@ import argparse import numpy as np -from bertserini.utils import choose_best_answer, weighted_score, normalize_answer, normalize_text, get_type +from bertserini.utils.utils import choose_best_answer, weighted_score -from bertserini.eval.evaluate_v1 import squad_v1_eval as squad_evaluation -from bertserini.eval.evaluate_v1_drcd import evaluation as drcd_evaluation -from bertserini.eval.evaluate_v1_cmrc import evaluate as cmrc_evaluation +from bertserini.experiments.eval.evaluate_v1 import squad_v1_eval as squad_evaluation +from bertserini.experiments.eval.evaluate_v1_drcd import evaluation as drcd_evaluation +from bertserini.experiments.eval.evaluate_v1_cmrc import evaluate as cmrc_evaluation def get_score_with_results(eval_data, predictions, mu, dataset): @@ -46,12 +46,8 @@ def get_score_with_results(eval_data, predictions, mu, dataset): "exact_match": eval_result[1], "total_count": eval_result[2], "skip_count": eval_result[3]} - elif args.dataset == "trivia": - eval_result = trivia_evaluation(eval_data, "tmp.answer") elif args.dataset == "drcd": eval_result = drcd_evaluation(eval_data, "tmp.answer") - elif args.dataset == "special": - eval_result = special_evaluation(eval_data, "tmp.answer") else: eval_result = squad_evaluation(eval_data, "tmp.answer") diff --git a/bertserini/experiment_cmrc.py b/bertserini/experiments/experiment_cmrc.py similarity index 83% rename from bertserini/experiment_cmrc.py rename to bertserini/experiments/experiment_cmrc.py index 8a84e49..ee697ef 100644 --- a/bertserini/experiment_cmrc.py +++ b/bertserini/experiments/experiment_cmrc.py @@ -1,8 +1,8 @@ import json from tqdm import tqdm -from .readers import BERT -from .pyserini_retriever import retriever, build_searcher -from .utils_new import extract_squad_questions +from bertserini.reader.bert_reader 
import BERT +from bertserini.retriever.pyserini_retriever import retriever, build_searcher +from bertserini.utils.utils_new import extract_squad_questions if __name__ == "__main__": diff --git a/bertserini/experiment_squad.py b/bertserini/experiments/experiment_squad.py similarity index 82% rename from bertserini/experiment_squad.py rename to bertserini/experiments/experiment_squad.py index a3289a0..84ed298 100644 --- a/bertserini/experiment_squad.py +++ b/bertserini/experiments/experiment_squad.py @@ -1,8 +1,8 @@ import json from tqdm import tqdm -from .readers import BERT -from .pyserini_retriever import retriever, build_searcher -from .utils_new import extract_squad_questions +from bertserini.reader.bert_reader import BERT +from bertserini.retriever.pyserini_retriever import retriever, build_searcher +from bertserini.utils.utils_new import extract_squad_questions if __name__ == "__main__": diff --git a/bertserini/interactive.py b/bertserini/interactive.py index 82a646a..e410ed6 100644 --- a/bertserini/interactive.py +++ b/bertserini/interactive.py @@ -1,7 +1,7 @@ -from .base import Question -from .readers import BERT -from .pyserini_retriever import retriever, build_searcher -from .utils_new import get_best_answer +from bertserini.reader.base import Question +from bertserini.reader.bert_reader import BERT +from bertserini.retriever.pyserini_retriever import retriever, build_searcher +from bertserini.utils.utils_new import get_best_answer if __name__ == "__main__": diff --git a/bertserini/reader/__init__.py b/bertserini/reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bertserini/base.py b/bertserini/reader/base.py similarity index 100% rename from bertserini/base.py rename to bertserini/reader/base.py diff --git a/bertserini/readers.py b/bertserini/reader/bert_reader.py similarity index 95% rename from bertserini/readers.py rename to bertserini/reader/bert_reader.py index 881128d..e24de9c 100644 --- a/bertserini/readers.py +++ 
b/bertserini/reader/bert_reader.py @@ -5,13 +5,13 @@ import torch from transformers.data.processors.squad import SquadResult -from .base import Reader, Question, Context, Answer +from bertserini.reader.base import Reader, Question, Context, Answer __all__ = ['BERT'] -from .run_squad import to_list +from bertserini.run_squad import to_list -from .utils_squad import SquadExample, compute_predictions_logits +from bertserini.utils.utils_squad import SquadExample, compute_predictions_logits def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: diff --git a/bertserini/retriever/__init__.py b/bertserini/retriever/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bertserini/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py similarity index 94% rename from bertserini/pyserini_retriever.py rename to bertserini/retriever/pyserini_retriever.py index 1c3e2f8..4a6df79 100644 --- a/bertserini/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -1,8 +1,8 @@ from typing import List from pyserini.search import SimpleSearcher, JSimpleSearcherResult -from .utils import init_logger -from .base import Context +from bertserini.utils.utils import init_logger +from bertserini.reader.base import Context logger = init_logger("retriever") diff --git a/bertserini/search.py b/bertserini/search.py deleted file mode 100644 index 7aa77bb..0000000 --- a/bertserini/search.py +++ /dev/null @@ -1,70 +0,0 @@ -import json -import time -from tqdm import trange - -from hanziconv import HanziConv - -from bertserini.bert_reader import BertReader -from bertserini.pyserini_retriever import retriever, build_searcher -from bertserini.utils import (convert_squad_to_list, normalize_text, strip_accents) - -from bertserini.args import * - -if __name__ == "__main__": - """ - Connect anserini with bert. 
- Question from SQuAD1.0-dev - search paragraph using Anserini (Top 1) - extract phrase using Bert (SQuAD1.0 pretrained version) - """ - #logger = init_logger("bert_search") - - QAs = convert_squad_to_list(args.predict_file) - - bert_reader = BertReader(args) - ansrini_searcher = build_searcher(args.k1, args.b, args.index_path, args.rm3, chinese=args.chinese) - - count_hit = [0] * (args.para_num) - count_total = [0] * (args.para_num) - - all_results = [] - - for question_id in trange(len(QAs)): - start_time = time.time() - question = strip_accents(QAs[question_id]['question']) # convert Latin into English - if args.chinese: - if args.toSimplified: - question = HanziConv.toSimplified(question) - paragraphs = retriever(question, ansrini_searcher, args.para_num) - else: - paragraphs = retriever(question, ansrini_searcher, args.para_num) - if len(paragraphs) == 0: - continue - paragraph_texts = [] - paragraph_scores = [] - hit_flag = False - for paragraph_id, paragraph in enumerate(paragraphs): - paragraph_texts.append(paragraph['text']) - paragraph_scores.append(paragraph['paragraph_score']) - count_total[paragraph_id] += 1 - if hit_flag: - count_hit[paragraph_id] += 1 - continue - for k in range(len(QAs[question_id]['answers'])): - if normalize_text(QAs[question_id]['answers'][k]["text"]) in normalize_text(paragraph['text']): - count_hit[paragraph_id] += 1 - hit_flag = True - break - - final_answers = bert_reader.predict(QAs[question_id]['id'], question, paragraph_texts, paragraph_scores) - - all_results.append(final_answers) - print(final_answers) - - json.dump(all_results, open(args.output_fn, 'w')) - - #logger.info("=======================================") - #logger.info("count_total {} count_hit {}".format(count_total, count_hit)) - json.dump([count_total, count_hit], open("count_{}.json".format(args.output_fn), "w")) - #logger.info("=======================================") - diff --git a/bertserini/utils/__init__.py b/bertserini/utils/__init__.py new file mode 
100644 index 0000000..e69de29 diff --git a/bertserini/utils.py b/bertserini/utils/utils.py similarity index 100% rename from bertserini/utils.py rename to bertserini/utils/utils.py diff --git a/bertserini/utils_new.py b/bertserini/utils/utils_new.py similarity index 93% rename from bertserini/utils_new.py rename to bertserini/utils/utils_new.py index 086a03f..6e0f537 100644 --- a/bertserini/utils_new.py +++ b/bertserini/utils/utils_new.py @@ -1,5 +1,5 @@ import json -from .base import Question, Answer +from bertserini.reader.base import Question def get_best_answer(candidates, weight=0.5): diff --git a/bertserini/utils_squad.py b/bertserini/utils/utils_squad.py similarity index 100% rename from bertserini/utils_squad.py rename to bertserini/utils/utils_squad.py From d819109343febb7c547a606809181f04bb213eb5 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 17:13:29 +0800 Subject: [PATCH 12/50] update for cmrc --- bertserini/reader/bert_reader.py | 4 +++- bertserini/utils/utils_new.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index e24de9c..c8cb6af 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -34,7 +34,9 @@ def craft_squad_examples(question: Question, contexts: List[Context]) -> List[Sq class BERT(Reader): - def __init__(self, model_name: str, tokenizer_name: str): + def __init__(self, model_name: str, tokenizer_name: str = None): + if tokenizer_name is None: + tokenizer_name = model_name self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(self.device).eval() self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True) diff --git a/bertserini/utils/utils_new.py b/bertserini/utils/utils_new.py index 6e0f537..6c3bef4 100644 --- a/bertserini/utils/utils_new.py +++ b/bertserini/utils/utils_new.py @@ -1,4 
+1,5 @@ import json +from hanziconv import HanziConv from bertserini.reader.base import Question @@ -17,6 +18,8 @@ def extract_squad_questions(squad_filename, language="en"): for qa in paragraph["qas"]: id_ = qa["id"] question = qa["question"] + if language == "zh": + HanziConv.toSimplified(question) questions.append(Question(question, id_, language)) return questions From 25ec43ec5ad2b5268cf8420f77d8b6ca8d502b06 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 17:21:57 +0800 Subject: [PATCH 13/50] remove args.py --- bertserini/args.py | 242 ---------------------------- bertserini/reader/bert_reader.py | 2 +- bertserini/train/__init__.py | 0 bertserini/{ => train}/run_squad.py | 2 - 4 files changed, 1 insertion(+), 245 deletions(-) delete mode 100644 bertserini/args.py create mode 100644 bertserini/train/__init__.py rename bertserini/{ => train}/run_squad.py (99%) diff --git a/bertserini/args.py b/bertserini/args.py deleted file mode 100644 index 5434d33..0000000 --- a/bertserini/args.py +++ /dev/null @@ -1,242 +0,0 @@ -import argparse -import pprint -import torch -# from transformers import (WEIGHTS_NAME, BertConfig, -# BertTokenizer, -# XLMConfig, XLMForQuestionAnswering, -# XLMTokenizer, XLNetConfig, -# XLNetForQuestionAnswering, -# XLNetTokenizer) -# from transformers import BertForQuestionAnswering - -# ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ -# for conf in (BertConfig, XLNetConfig, XLMConfig)), ()) -# MODEL_CLASSES = { -# 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), -# 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), -# 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), -# } - -parser = argparse.ArgumentParser() - -## Other parameters - -parser.add_argument("--eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") -parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") 
-parser.add_argument('--chinese', action='store_true', help="Chinese") -parser.add_argument('--toSimplified', action='store_true', help="to simplified chinese") -parser.add_argument('--k1', type=float, default=0.9, - help='bm25 parameter') -parser.add_argument('--b', type=float, default=0.4, - help='bm25 parameter') -parser.add_argument('--rm3', action="store_true", - help='wether use rm3 ranker') -parser.add_argument('--index_path', type=str, - help='Path to index file') -parser.add_argument('--para_num', type=int, - help='number of top k paragraphs into bert') -parser.add_argument('--linking', action="store_true", default=False) -parser.add_argument('--link_doc_score', type=float, default=30) -parser.add_argument('--output_fn', type=str, default="output.json", - help='output file name') - -parser.add_argument( - "--model_type", - default=None, - type=str, - required=True, - #help="Model type selected in the list: " + ", ".join(MODEL_TYPES), -) -parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models", -) -parser.add_argument( - "--output_dir", - default=None, - type=str, - required=True, - help="The output directory where the model checkpoints and predictions will be written.", -) - -# Other parameters -parser.add_argument( - "--data_dir", - default=None, - type=str, - help="The input data dir. Should contain the .json files for the task." - + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", -) -parser.add_argument( - "--train_file", - default=None, - type=str, - help="The input training file. If a data dir is specified, will look for the file there" - + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", -) -parser.add_argument( - "--predict_file", - default=None, - type=str, - help="The input evaluation file. 
If a data dir is specified, will look for the file there" - + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", -) -parser.add_argument( - "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" -) -parser.add_argument( - "--tokenizer_name", - default="", - type=str, - help="Pretrained tokenizer name or path if not the same as model_name", -) -parser.add_argument( - "--cache_dir", - default="", - type=str, - help="Where do you want to store the pre-trained models downloaded from s3", -) - -parser.add_argument( - "--version_2_with_negative", - action="store_true", - help="If true, the SQuAD examples contain some that do not have an answer.", -) -parser.add_argument( - "--null_score_diff_threshold", - type=float, - default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.", -) - -parser.add_argument( - "--max_seq_length", - default=384, - type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.", -) -parser.add_argument( - "--doc_stride", - default=128, - type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.", -) -parser.add_argument( - "--max_query_length", - default=64, - type=int, - help="The maximum number of tokens for the question. Questions longer than this will " - "be truncated to this length.", -) -parser.add_argument("--do_train", action="store_true", help="Whether to run training.") -parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") -parser.add_argument( - "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." -) -parser.add_argument( - "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." 
-) - -parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") -parser.add_argument( - "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." -) -parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") -parser.add_argument( - "--gradient_accumulation_steps", - type=int, - default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.", -) -parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") -parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") -parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") -parser.add_argument( - "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." -) -parser.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.", -) -parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") -parser.add_argument( - "--n_best_size", - default=20, - type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", -) -parser.add_argument( - "--max_answer_length", - default=30, - type=int, - help="The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another.", -) -parser.add_argument( - "--verbose_logging", - action="store_true", - help="If true, all of the warnings related to data processing will be printed. 
" - "A number of warnings are expected for a normal SQuAD evaluation.", -) -parser.add_argument( - "--lang_id", - default=0, - type=int, - help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)", -) - -parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") -parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") -parser.add_argument( - "--eval_all_checkpoints", - action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", -) -parser.add_argument( - "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" -) -parser.add_argument( - "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" -) -parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - -parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") -parser.add_argument( - "--fp16", - action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", -) -parser.add_argument( - "--fp16_opt_level", - type=str, - default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
- "See details at https://nvidia.github.io/apex/amp.html", -) -parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") -parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") - -parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features") - -args = parser.parse_args() - -pprint.pprint(vars(args)) - -if args.local_rank == -1 or args.no_cuda: - device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = torch.cuda.device_count() -else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs - torch.cuda.set_device(args.local_rank) - device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') - args.n_gpu = 1 -args.device = device - diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index c8cb6af..a9a602d 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -9,7 +9,7 @@ __all__ = ['BERT'] -from bertserini.run_squad import to_list +from bertserini.train.run_squad import to_list from bertserini.utils.utils_squad import SquadExample, compute_predictions_logits diff --git a/bertserini/train/__init__.py b/bertserini/train/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bertserini/run_squad.py b/bertserini/train/run_squad.py similarity index 99% rename from bertserini/run_squad.py rename to bertserini/train/run_squad.py index 782dae0..e24b6e3 100644 --- a/bertserini/run_squad.py +++ b/bertserini/train/run_squad.py @@ -52,8 +52,6 @@ except ImportError: from tensorboardX import SummaryWriter -# from args import * - logger = logging.getLogger(__name__) MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()) From 4e72b578d91f021e707cb38bc4c83a1f809af74d Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 
17:39:44 +0800 Subject: [PATCH 14/50] replace hardcode args by argparse --- bertserini/experiments/args.py | 52 +++++++++++++++++++ bertserini/experiments/experiment_cmrc.py | 27 ---------- .../{experiment_squad.py => inference.py} | 12 ++--- 3 files changed, 58 insertions(+), 33 deletions(-) create mode 100644 bertserini/experiments/args.py delete mode 100644 bertserini/experiments/experiment_cmrc.py rename bertserini/experiments/{experiment_squad.py => inference.py} (66%) diff --git a/bertserini/experiments/args.py b/bertserini/experiments/args.py new file mode 100644 index 0000000..376790e --- /dev/null +++ b/bertserini/experiments/args.py @@ -0,0 +1,52 @@ +import argparse + +parser = argparse.ArgumentParser() + +parser.add_argument( + "--dataset_path", + default=None, + type=str, + required=True, + help="Path to the [dev, test] dataset", +) + +parser.add_argument( + "--index_path", + default=None, + type=str, + required=True, + help="Path to the indexes of contexts", +) +parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models", +) +parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", +) +parser.add_argument( + "--output", + default=None, + type=str, + required=True, + help="The output file where the runs results will be written to", +) +parser.add_argument( + "--language", + default="en", + type=str, + help="The language of task", +) +parser.add_argument( + "--topk", + default=10, + type=int, + help="The number of contexts retrieved for a question", +) +args = parser.parse_args() \ No newline at end of file diff --git a/bertserini/experiments/experiment_cmrc.py b/bertserini/experiments/experiment_cmrc.py deleted file mode 100644 index ee697ef..0000000 --- a/bertserini/experiments/experiment_cmrc.py +++ /dev/null @@ -1,27 +0,0 @@ -import json -from tqdm import 
tqdm -from bertserini.reader.bert_reader import BERT -from bertserini.retriever.pyserini_retriever import retriever, build_searcher -from bertserini.utils.utils_new import extract_squad_questions - -if __name__ == "__main__": - - questions = extract_squad_questions("data/cmrc_dev_squad.json") - bert_reader = BERT("rsvp-ai/bertserini-bert-base-cmrc", "rsvp-ai/bertserini-bert-base-cmrc") - searcher = build_searcher("index/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors", language="zh") - - all_answer = [] - for question in tqdm(questions): - contexts = retriever(question, searcher, 10) - final_answers = bert_reader.predict(question, contexts) - final_answers_lst = [] - for ans in final_answers: - final_answers_lst.append( - {"id": question.id, - "answer": ans.text, - "phrase_score": ans.score, - "paragraph_score": ans.ctx_score, - } - ) - all_answer.append(final_answers_lst) - json.dump(all_answer, open("result_cmrc.json", 'w'), indent=4) diff --git a/bertserini/experiments/experiment_squad.py b/bertserini/experiments/inference.py similarity index 66% rename from bertserini/experiments/experiment_squad.py rename to bertserini/experiments/inference.py index 84ed298..de6d042 100644 --- a/bertserini/experiments/experiment_squad.py +++ b/bertserini/experiments/inference.py @@ -3,16 +3,16 @@ from bertserini.reader.bert_reader import BERT from bertserini.retriever.pyserini_retriever import retriever, build_searcher from bertserini.utils.utils_new import extract_squad_questions +from bertserini.experiments.args import * if __name__ == "__main__": - - questions = extract_squad_questions("data/dev-v1.1.json") - bert_reader = BERT("rsvp-ai/bertserini-bert-base-squad", "rsvp-ai/bertserini-bert-base-squad") - searcher = build_searcher("index/lucene-index.enwiki-20180701-paragraphs") + questions = extract_squad_questions(args.dataset_path) + bert_reader = BERT(args.model_name_or_path, args.tokenizer_name) + searcher = build_searcher(args.index_path, 
language=args.language) all_answer = [] for question in tqdm(questions): - contexts = retriever(question, searcher, 10) + contexts = retriever(question, searcher, args.topk) final_answers = bert_reader.predict(question, contexts) final_answers_lst = [] for ans in final_answers: @@ -24,5 +24,5 @@ } ) all_answer.append(final_answers_lst) - json.dump(all_answer, open("result_bert_base.json", 'w'), indent=4) + json.dump(all_answer, open(args.output, 'w'), indent=4) From 45bd735d43b5991362cdfd6d058e6e5b2241d72a Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 17:41:47 +0800 Subject: [PATCH 15/50] remove some scripts that won't be used in the future --- scripts/demo.sh | 14 -------------- scripts/demo_cmrc.sh | 15 --------------- scripts/eval.sh | 10 ---------- scripts/eval_cmrc.sh | 10 ---------- scripts/inference.sh | 15 --------------- scripts/inference_cmrc.sh | 16 ---------------- scripts/interactive.sh | 13 ------------- 7 files changed, 93 deletions(-) delete mode 100644 scripts/demo.sh delete mode 100644 scripts/demo_cmrc.sh delete mode 100755 scripts/eval.sh delete mode 100755 scripts/eval_cmrc.sh delete mode 100644 scripts/inference.sh delete mode 100644 scripts/inference_cmrc.sh delete mode 100644 scripts/interactive.sh diff --git a/scripts/demo.sh b/scripts/demo.sh deleted file mode 100644 index 76b7d77..0000000 --- a/scripts/demo.sh +++ /dev/null @@ -1,14 +0,0 @@ -SQUAD_DIR=~/00_data/squad_v1.1/ - -python interactive.py \ - --para_num 10 \ - --index_path index/lucene-index.enwiki-20180701-paragraphs \ - --model_type bert \ - --model_name_or_path rsvp-ai/bertserini-bert-base-squad \ - --do_eval \ - --do_lower_case \ - --max_seq_length 384 \ - --doc_stride 128 \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --eval_batch_size=10 \ - --output_dir=./demo diff --git a/scripts/demo_cmrc.sh b/scripts/demo_cmrc.sh deleted file mode 100644 index 6a0a110..0000000 --- a/scripts/demo_cmrc.sh +++ /dev/null @@ -1,15 +0,0 @@ 
-SQUAD_DIR=/data/y247xie/00_data/cmrc2018/data - -python ./interactive.py \ - --para_num 10 \ - --index_path index/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors \ - --model_type bert \ - --model_name_or_path rsvp-ai/bertserini-bert-base-cmrc \ - --do_eval \ - --do_lower_case \ - --max_seq_length 384 \ - --doc_stride 128 \ - --predict_file $SQUAD_DIR/cmrc_dev_to_squad.json \ - --eval_batch_size=10 \ - --output_dir=./demo_cmrc \ - --chinese diff --git a/scripts/eval.sh b/scripts/eval.sh deleted file mode 100755 index ba87f5e..0000000 --- a/scripts/eval.sh +++ /dev/null @@ -1,10 +0,0 @@ -DIR=./results/bert_base_squad -FILE=test_inference.json -DATA_PATH=/data/yqxie/00_data/squad_v1.1/dev-v1.1.json - -python aggregate.py \ - --eval_data ${DATA_PATH} \ - --search_file ${DIR}/${FILE} \ - --output_path ${DIR} \ - --dataset squad - diff --git a/scripts/eval_cmrc.sh b/scripts/eval_cmrc.sh deleted file mode 100755 index 1ebc388..0000000 --- a/scripts/eval_cmrc.sh +++ /dev/null @@ -1,10 +0,0 @@ -DIR=./results -FILE=test_inference_cmrc.json -DATA_PATH=/data/yqxie/00_data/cmrc2018/data/cmrc2018_dev.json - -python aggregate.py \ - --eval_data ${DATA_PATH} \ - --search_file ${DIR}/${FILE} \ - --output_path ${DIR} \ - --dataset cmrc - diff --git a/scripts/inference.sh b/scripts/inference.sh deleted file mode 100644 index 4e419b3..0000000 --- a/scripts/inference.sh +++ /dev/null @@ -1,15 +0,0 @@ -SQUAD_DIR=~/00_data/squad_v1.1/ - -python ./search.py \ - --para_num 10 \ - --index_path index/lucene-index.enwiki-20180701-paragraphs \ - --model_type bert \ - --model_name_or_path rsvp-ai/bertserini-bert-base-squad \ - --do_eval \ - --do_lower_case \ - --max_seq_length 384 \ - --doc_stride 128 \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --eval_batch_size=32 \ - --output_fn result_bert_base.json \ - --output_dir tmp/bert_base_squad diff --git a/scripts/inference_cmrc.sh b/scripts/inference_cmrc.sh deleted file mode 100644 index 10e786e..0000000 --- 
a/scripts/inference_cmrc.sh +++ /dev/null @@ -1,16 +0,0 @@ -SQUAD_DIR=/data/y247xie/00_data/cmrc2018/data - -python ./search.py \ - --para_num 10 \ - --index_path index/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors \ - --model_type bert \ - --model_name_or_path rsvp-ai/bertserini-bert-base-cmrc \ - --do_eval \ - --do_lower_case \ - --max_seq_length 384 \ - --doc_stride 128 \ - --predict_file $SQUAD_DIR/cmrc_dev_to_squad.json \ - --eval_batch_size=32 \ - --output_fn test_inference_cmrc.json \ - --output_dir tmp/cmrc_base_cmrc \ - --chinese diff --git a/scripts/interactive.sh b/scripts/interactive.sh deleted file mode 100644 index 4a32b08..0000000 --- a/scripts/interactive.sh +++ /dev/null @@ -1,13 +0,0 @@ -SQUAD_DIR=~/00_data/squad_v1.1/ - -python ./interactive.py \ - --para_num 10 \ - --index_path index/lucene-index.enwiki-20180701-paragraphs \ - --model_type bert \ - --model_name_or_path ./tmp/bert_base_squad \ - --do_eval \ - --do_lower_case \ - --max_seq_length 384 \ - --doc_stride 128 \ - --predict_file $SQUAD_DIR/dev-v1.1.json \ - --eval_batch_size=32 From fde68cf096400835f5ba41aa54ea4e50d71734b8 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 17:50:19 +0800 Subject: [PATCH 16/50] improve repo structure --- docs/experiments-cmrc.md | 0 docs/experiments-squad.md | 0 indexes/.gitkeep | 1 + logs/.gitkeep | 1 + models/.gitkeep | 1 + predictions/.gitkeep | 1 + tests/.gitkeep | 1 + 7 files changed, 5 insertions(+) create mode 100644 docs/experiments-cmrc.md create mode 100644 docs/experiments-squad.md create mode 100644 indexes/.gitkeep create mode 100644 logs/.gitkeep create mode 100644 models/.gitkeep create mode 100644 predictions/.gitkeep create mode 100644 tests/.gitkeep diff --git a/docs/experiments-cmrc.md b/docs/experiments-cmrc.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/experiments-squad.md b/docs/experiments-squad.md new file mode 100644 index 0000000..e69de29 diff --git a/indexes/.gitkeep 
b/indexes/.gitkeep new file mode 100644 index 0000000..b4509c2 --- /dev/null +++ b/indexes/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for models. Placeholder so that directory is kept in git. \ No newline at end of file diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 0000000..b4509c2 --- /dev/null +++ b/logs/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for models. Placeholder so that directory is kept in git. \ No newline at end of file diff --git a/models/.gitkeep b/models/.gitkeep new file mode 100644 index 0000000..b4509c2 --- /dev/null +++ b/models/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for models. Placeholder so that directory is kept in git. \ No newline at end of file diff --git a/predictions/.gitkeep b/predictions/.gitkeep new file mode 100644 index 0000000..b4509c2 --- /dev/null +++ b/predictions/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for models. Placeholder so that directory is kept in git. \ No newline at end of file diff --git a/tests/.gitkeep b/tests/.gitkeep new file mode 100644 index 0000000..b4509c2 --- /dev/null +++ b/tests/.gitkeep @@ -0,0 +1 @@ +# This is the default directory for models. Placeholder so that directory is kept in git. \ No newline at end of file From b5451d58a2ab3a6cc83e4b30ea5e550b5f69da81 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 18:51:04 +0800 Subject: [PATCH 17/50] add experiments documents --- README.md | 158 ++++++++------------------------------ docs/experiments-cmrc.md | 73 ++++++++++++++++++ docs/experiments-squad.md | 79 +++++++++++++++++++ 3 files changed, 184 insertions(+), 126 deletions(-) diff --git a/README.md b/README.md index 47000d1..2395b11 100644 --- a/README.md +++ b/README.md @@ -12,151 +12,57 @@ Following the Open Domain QA setting of DrQA, we are using Wikipedia as the larg # Quick Start -1. [Install dependencies](#install-dependencies) +1. [Install dependencies](#package-installation) 2. 
[Download the PreBuilt Wikipedia index](#download-prebuilt-wikipedia-index) 3. [Download the pretrained models](#download-the-pretrained-models) 4. [Quickly start the Demo](#start-the-demo) -## Install dependencies - -BERTserini requires Python 3.5+ and a couple Python dependencies. The repo is tested on Python 3.6, Cuda 10.1, PyTorch 1.5.1 on Tesla P40 GPUs. -Besides that, [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) is recommended for convinence. Please run the following commands to install the Python dependencies. - +## Package Installation ``` -conda create -n bertserini -conda activate bertserini -conda install tqdm -pip install pyserini -pip install transformers -pip install torch==1.5.1+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html # or install torch according to your cuda version -pip install tensorboardX -pip install hanziconv # for chinese processing +pip install bertserini ``` +## Development Installation +BERTserini requires Python 3.6+ and a couple Python dependencies. +The repo is tested on Python 3.6, Cuda 10.1, PyTorch 1.5.1 on Tesla P40 GPUs. +Besides that, [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) is recommended for convinence. Please run the following commands to install the Python dependencies. +1. Clone the repo with ```git clone https://github.com/rsvp-ai/bertserini.git``` +2. ```pip install -r requirements.txt``` + NOTE: Pyserini is the Python wrapper for Anserini. Please refer to their project [Pyserini](https://github.com/castorini/pyserini) for detailed usage. Also, Pyserini supports part of the features in Anserini; you can also refer to [Anserini](https://github.com/castorini/anserini) for more settings. 
-## Download PreBuilt Wikipedia Index - -We have indexed the 20180701 Wikipedia dump used in DrQA with Anserini; you can download the prepared index here: -``` -wget ftp://72.143.107.253/BERTserini/english_wiki_2018_index.zip -```` -For the chinese index, please download through: -``` -wget ftp://72.143.107.253/BERTserini/chinese_wiki_2018_index.zip -``` -```*index.zip``` contains the indexed latest Wikipedia dump with Anserini. - -After unzipping these files, put them under the root path of this repo, and then you are ready to go. -Take the following folder structure as an example: -``` -bertserini -+--- index -| +--- lucene-index.enwiki-20180701-paragraphs -| | +--- ... -+--- other files under this repo -``` - -## Download the pre-trained models - -We have uploaded the finetuned checkpoints to the huggingface models: \ -[bertserini-bert-base-squad](https://huggingface.co/rsvp-ai/bertserini-bert-base-squad) \ -[bertserini-bert-large-squad](https://huggingface.co/rsvp-ai/bertserini-bert-large-squad) \ -[bertserini-bert-base-cmrc](https://huggingface.co/rsvp-ai/bertserini-bert-base-cmrc) # this is for Chinese \ -[bertserini-roberta-base](https://huggingface.co/rsvp-ai/bertserini-roberta-base) - -To run our finetuned model, just set ```--model_name_or_path rsvp-ai/``` for example: ```--model_name_or_path rsvp-ai/bertserini-bert-large-squad```. - -We also provide the Chinese version of this pipeline on [CMRC](https://github.com/ymcui/cmrc2018) and [DRCD](https://github.com/DRCKnowledgeTeam/DRCD) datasets. - -## Start the Demo - -To quickly try out the system, you should follow ```demo.sh``` to set the paths, then -``` -bash demo.sh -``` -or -``` -bash demo_cmrc.sh -``` -You can try our fine-tuned model with the Wikipedia articles. - -# Training, inferencing and evaluating using your data - -You may use your data on this system; we provide the steps based on the SQuAD dataset. 
+## A Simple Question-Answer Example +```python +from bertserini.reader.base import Question, Context +from bertserini.reader.bert_reader import BERT +from bertserini.utils.utils_new import get_best_answer -## Prepare index files: -To get the index on your corpus, please refer to [Pyserini](https://github.com/castorini/pyserini#how-do-i-search-my-own-documents). +model_name = "rsvp-ai/bertserini-bert-base-squad" +tokenizer_name = "rsvp-ai/bertserini-bert-base-squad" +bert_reader = BERT(model_name, tokenizer_name) -After getting the index, put it under the path ```bertserini/index/``` +# Here is our question: +question = Question("Why did Mark Twain call the 19th century the glied age?") -## Prepare the training datasets: +# Option 1: fetch some contexts from Wikipedia with Pyserini +from bertserini.retriever.pyserini_retriever import retriever, build_searcher +searcher = build_searcher("/path/to/enwiki/index/") +contexts = retriever(question, searcher, 10) -``` -wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -``` - -For CMRC dataset, please refer to [CMRC](https://github.com/ymcui/cmrc2018). -TODO: for DRCD dataset, which is in Traditional Chinese, we have implemented the to-simplified argment, however, the results has not been tested. - -## Training -Please set the correct parameters in the following script and then run. -``` -bash train.sh -``` -or, for chinese: -``` -bash train_cmrc.sh -``` - -This script will run the training for BERT on the SQuAD dataset. -It will generate checkpoints under ./tmp/ \ -You can also train the base model from other pre-trained models as long as it is already supporting Question Answering. 
+# Option 2: hard-coded contexts +contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] -## Inferencing SQuAD under the open-domain setting +# Either option, we can ten get the answer candidates by reader +# and then select out the best answer based on the linear +# combination of context score and phase score +candidates = bert_reader.predict(question, contexts) +answer = get_best_answer(candidates, 0.45) +print(answer.text) -Set the checkpoint information in the below script, according to the path after training. \ -We have upload the finetuned checkpoints to hugging face, you can directly use them with the argments```--model_name_or_path rsvp-ai/bertserini-bert-base-squad``` or ```--model_name_or_path rsvp-ai/bertserini-bert-large-squad``` in the ```inference.sh``` script. \ -Then run: -``` -bash inference.sh -``` -or, for chinese: -``` -inference_cmrc.sh #this added --chinese argments to switch to chinese example processing ``` -It will generate inference results on SQuAD, under the path of ./results/ - -## Evaluation -Set the result paths according to the inference result path -``` -bash eval.sh -``` -This script will first automatically select the parameter to aggregate paragraph score (from Pyserini) and phrase score (from BERT), and finally, select the best parameter and print the evaluation matrixs. 
-``` -# expected results: - -## rsvp-ai/bertserini-large-squad, this is finetuned based on bert-large-wwm-uncased -(0.4, {'exact_match': 41.54210028382214, 'f1': 49.45378799697662, 'recall': 51.119838584003105, 'precision': 49.8395951713666, 'cover': 47.228003784295176, 'overlap': 57.6631977294229}) - -## rsvp-ai/bertserini-bert-base-squad, this is finetuned based on bert-base-uncased -(0.5, {'exact_match': 39.89593188268685, 'f1': 47.58710784120026, 'recall': 49.27586877280707, 'precision': 48.10849111109448, 'cover': 45.31693472090823, 'overlap': 56.00756859035005}) - -## rsvp-ai/bertserini-bert-base-cmrc, this is bert-base-chinese finetuned on the chinese reading comprehension dataset(CMRC) -(0.5, {'f1_score': 68.0033167812909, 'exact_match': 51.164958061509786, 'total_count': 3219, 'skip_count': 1}) -``` - -## Notes - -We also provide the code to run with Anseirni's indexing version. \ -This requires .jar files from compiled [Anserini](https://github.com/castorini/anserini). \ -You can look into Anserini's repo and modify the code for you own needs. \ -And then swithch to the API connecting Anserini provided in ./retriever/anserini_retriever.py #TODO: swithch to argument setting - ## Citation diff --git a/docs/experiments-cmrc.md b/docs/experiments-cmrc.md index e69de29..ab63f13 100644 --- a/docs/experiments-cmrc.md +++ b/docs/experiments-cmrc.md @@ -0,0 +1,73 @@ +# Bertserini: Baseline on CMRC QA (in Chinese) + +1. Clone the repo with ```git clone https://github.com/rsvp-ai/bertserini.git``` +2. ```pip install -r requirements.txt``` + +## Download PreBuilt Wikipedia Index + +We have indexed the 2018 Wikipedia Chinese dump. You can download the prepared index here: +``` +wget ftp://72.143.107.253/BERTserini/chinese_wiki_2018_index.zip +``` +```*index.zip``` contains the indexed latest Wikipedia dump with Anserini. + +After unzipping these files, put them under the root path of this repo, and then you are ready to go. 
+Take the following folder structure as an example: +``` +bertserini ++--- indexes +| +--- lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors +| | +--- ... ++--- other files under this repo +``` + +## Download the pre-trained models + +We have uploaded the finetuned checkpoints to the huggingface models: \ +[bertserini-bert-base-cmrc](https://huggingface.co/rsvp-ai/bertserini-bert-base-cmrc) + + +To run our finetuned model, just set ```--model_name_or_path rsvp-ai/```. +For example: ```--model_name_or_path rsvp-ai/bertserini-bert-base-cmrc```. + +# Inferencing and evaluating + +## Prepare the datasets: + +``` +cd data +wget https://worksheets.codalab.org/rest/bundles/0xb70e5e281fcd437d9aa8f1c4da107ae4/contents/blob/ +mv index.html cmrc2018_dev.json +wget https://worksheets.codalab.org/rest/bundles/0x72252619f67b4346a85e122049c3eabd/contents/blob/ +mv index.html cmrc2018_dev_squad.json +``` + +## Inferencing CMRC under the open-domain setting +For `rsvp-ai/bertserini-bert-base-cmrc` +``` +python -m bertserini.experiments.inference --dataset_path data/cmrc2018_dev_squad.json \ + --index_path indexes/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors \ + --model_name_or_path rsvp-ai/bertserini-bert-base-cmrc \ + --output prediction/cmrc2018_pred.json + --topk 10 + --language zh + +``` + +## Evaluation + +``` +mkdir temp +pyhton -m bertserini.experiments.evaluate --eval_data data/cmrc2018_dev.json \ + --search_file prediction/cmrc2018_pred.json \ + --output_path temp \ + --dataset cmrc + +``` + +Expected results: + +``` +## rsvp-ai/bertserini-bert-base-cmrc, this is bert-base-chinese finetuned on the chinese reading comprehension dataset(CMRC) +(0.5, {'f1_score': 68.0033167812909, 'exact_match': 51.164958061509786, 'total_count': 3219, 'skip_count': 1}) +``` \ No newline at end of file diff --git a/docs/experiments-squad.md b/docs/experiments-squad.md index e69de29..105b67d 100644 --- a/docs/experiments-squad.md +++ b/docs/experiments-squad.md 
@@ -0,0 +1,79 @@ +# Bertserini: Baseline on SQUAD QA + +1. Clone the repo with ```git clone https://github.com/rsvp-ai/bertserini.git``` +2. ```pip install -r requirements.txt``` + +## Download PreBuilt Wikipedia Index + +We have indexed the 20180701 Wikipedia dump used in DrQA with Anserini; you can download the prepared index here: +``` +wget ftp://72.143.107.253/BERTserini/english_wiki_2018_index.zip +```` +```*index.zip``` contains the indexed latest Wikipedia dump with Anserini. + +After unzipping these files, put them under the root path of this repo, and then you are ready to go. +Take the following folder structure as an example: +``` +bertserini ++--- indexes +| +--- lucene-index.enwiki-20180701-paragraphs ++--- other files under this repo +``` + +## Download the pre-trained models + +We have uploaded the finetuned checkpoints to the huggingface models: \ +[bertserini-bert-base-squad](https://huggingface.co/rsvp-ai/bertserini-bert-base-squad) \ +[bertserini-bert-large-squad](https://huggingface.co/rsvp-ai/bertserini-bert-large-squad) \ +[bertserini-roberta-base](https://huggingface.co/rsvp-ai/bertserini-roberta-base) + +To run our finetuned model, just set ```--model_name_or_path rsvp-ai/``` +For example: ```--model_name_or_path rsvp-ai/bertserini-bert-large-squad```. 
+ +# Inferencing and evaluating + +## Prepare the datasets: + +``` +cd data +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json +``` + +## Inferencing SQuAD under the open-domain setting +For `rsvp-ai/bertserini-bert-base-squad` +``` +python -m bertserini.experiments.inference --dataset_path data/dev-v1.1.json \ + --index_path indexes/lucene-index.enwiki-20180701-paragraphs \ + --model_name_or_path rsvp-ai/bertserini-bert-base-squad \ + --output squad_bert_base_pred.json + --topk 10 + +``` + +For `rsvp-ai/bertserini-bert-large-squad` +``` +python -m bertserini.experiments.inference --dataset_path data/dev-v1.1.json \ + --index_path indexes/lucene-index.enwiki-20180701-paragraphs \ + --model_name_or_path rsvp-ai/bertserini-bert-large-squad \ + --output prediction/squad_bert_large_pred.json + --topk 10 + +``` +## Evaluation + +``` +mkdir temp +pyhton -m bertserini.experiments.evaluate --eval_data data/dev-v1.1.json \ + --search_file prediction/squad_bert_large_pred.json \ + --output_path temp \ + --dataset squad + +``` +Expected results: +``` +## rsvp-ai/bertserini-large-squad, this is finetuned based on bert-large-wwm-uncased +(0.4, {'exact_match': 41.54210028382214, 'f1': 49.45378799697662, 'recall': 51.119838584003105, 'precision': 49.8395951713666, 'cover': 47.228003784295176, 'overlap': 57.6631977294229}) + +## rsvp-ai/bertserini-bert-base-squad, this is finetuned based on bert-base-uncased +(0.5, {'exact_match': 40.179754020813625, 'f1': 47.828056659017584, 'recall': 49.517951036176, 'precision': 48.3495034100538, 'cover': 45.50614947965941, 'overlap': 56.20624408703879}) +``` \ No newline at end of file From 49b7301b8728a491e02826b7dfb93af7e4c58761 Mon Sep 17 00:00:00 2001 From: xueguang Date: Tue, 22 Sep 2020 19:11:21 +0800 Subject: [PATCH 18/50] fix typo in doc --- README.md | 8 -------- {predictions => prediction}/.gitkeep | 0 2 files changed, 8 deletions(-) rename {predictions => prediction}/.gitkeep (100%) diff --git a/README.md 
b/README.md index 2395b11..02fcbd8 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,6 @@ We demonstrate an end-to-end Open-Domain question answering system that integrat Following the Open Domain QA setting of DrQA, we are using Wikipedia as the large scale knowledge source of documents. The system first retrieves several candidate text segmentations among the entire knowledge source of documents, then read through the candidate text segments to determine the answers. -# Quick Start - -1. [Install dependencies](#package-installation) -2. [Download the PreBuilt Wikipedia index](#download-prebuilt-wikipedia-index) -3. [Download the pretrained models](#download-the-pretrained-models) -4. [Quickly start the Demo](#start-the-demo) - - ## Package Installation ``` pip install bertserini diff --git a/predictions/.gitkeep b/prediction/.gitkeep similarity index 100% rename from predictions/.gitkeep rename to prediction/.gitkeep From 6b2e3e7402368b4f8274819b8ae67cdf75ce14b8 Mon Sep 17 00:00:00 2001 From: xueguang Date: Fri, 25 Sep 2020 02:19:47 +0800 Subject: [PATCH 19/50] bug fix --- bertserini/utils/utils_new.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertserini/utils/utils_new.py b/bertserini/utils/utils_new.py index 6c3bef4..1e6c6e3 100644 --- a/bertserini/utils/utils_new.py +++ b/bertserini/utils/utils_new.py @@ -6,7 +6,7 @@ def get_best_answer(candidates, weight=0.5): for ans in candidates: ans.aggregate_score(weight) - return candidates.sorted(key=lambda x: x.total_score, reverse=True)[0] + return sorted(candidates, key=lambda x: x.total_score, reverse=True)[0] def extract_squad_questions(squad_filename, language="en"): From 07e8e3ef7d603bdfd09890006d759ad02c0733b6 Mon Sep 17 00:00:00 2001 From: xueguang Date: Fri, 25 Sep 2020 02:33:51 +0800 Subject: [PATCH 20/50] update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 74c4728..8d4862d 100644 --- a/setup.py +++ b/setup.py 
@@ -9,11 +9,11 @@ setuptools.setup( name='bertserini', version='0.0.1', - packages=['bertserini', 'bertserini.eval', 'bertserini.retriever'], + packages=['bertserini', 'bertserini.reader', 'bertserini.retriever', 'bertserini.experiments', 'bertserini.utils'], url='https://github.com/rsvp-ai/bertserini', license='', author='bertserini', - author_email='x93ma@uwaterloo.ca', + author_email='yuqing.xie@uwaterloo.ca', description='An end-to-end Open-Domain question answering system', install_requires=requirements, classifiers=[ From 570eea81c36e03f8ec0a8945c4272b0be866abfa Mon Sep 17 00:00:00 2001 From: xueguang Date: Fri, 25 Sep 2020 02:44:41 +0800 Subject: [PATCH 21/50] specify package versions --- requirements.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 8f946e4..7562f84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -tqdm -numpy -pyserini -transformers -torch -torchvision -tensorboardX -hanziconv \ No newline at end of file +tqdm>=4.45.0 +numpy>=1.18.5 +pyserini==0.9.4.0 +transformers>=2.10.0 +torch==1.5.1+cu101 +torchvision==0.6.1+cu101 +tensorboardX>=2.1 +hanziconv>=0.3.2 \ No newline at end of file From e57bce6e6d865756789b4c33e8e40c81f4d2b31e Mon Sep 17 00:00:00 2001 From: xueguang Date: Fri, 25 Sep 2020 03:05:53 +0800 Subject: [PATCH 22/50] update README --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 02fcbd8..4d77b21 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,9 @@ Please refer to their project [Pyserini](https://github.com/castorini/pyserini) ## A Simple Question-Answer Example +We provided an online interface to simply play with english QA [here](https://huggingface.co/rsvp-ai/bertserini-bert-base-squad?text=Where+do+I+live%3F&context=My+name+is+Sarah+and+I+live+in+London) + +Below is a example for English Question-Answering. 
We also provide an example for Chinese Question-Answering [here](). ```python from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT @@ -53,8 +56,20 @@ contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describ candidates = bert_reader.predict(question, contexts) answer = get_best_answer(candidates, 0.45) print(answer.text) +``` + +NOTE: + The index we used above is English Wikipedia, which could be download via: ``` +wget ftp://72.143.107.253/BERTserini/english_wiki_2018_index.zip +``` + +After unzipping these file, we suggest you putting it in `indexes/`. + +We have uploaded following finetuned checkpoints to the huggingace models:\ +[bertserini-bert-base-squad](https://huggingface.co/rsvp-ai/bertserini-bert-base-squad) \ +[bertserini-bert-large-squad](https://huggingface.co/rsvp-ai/bertserini-bert-large-squad) ## Citation From 48db4643d2d55a60d4c208578b0a96f9823beaee Mon Sep 17 00:00:00 2001 From: xueguang Date: Fri, 25 Sep 2020 06:29:22 +0800 Subject: [PATCH 23/50] make bert args configurable --- bertserini/reader/bert_reader.py | 48 ++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index a9a602d..0b30a23 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -40,6 +40,26 @@ def __init__(self, model_name: str, tokenizer_name: str = None): self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(self.device).eval() self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True) + self.args = { + "max_seq_length": 384, + "doc_stride": 128, + "max_query_length": 64, + "threads": 1, + "tqdm_enabled": False, + "n_best_size": 20, + "max_answer_length": 30, + "do_lower_case": True, + "output_prediction_file": False, + "output_nbest_file": None, + 
"output_null_log_odds_file": None, + "verbose_logging": False, + "version_2_with_negative": True, + "null_score_diff_threshold": 0, + } + + def update_args(self, args_to_change): + for key in args_to_change: + self.args[key] = args_to_change[key] def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: examples = craft_squad_examples(question, contexts) @@ -47,13 +67,13 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=self.tokenizer, - max_seq_length=384, - doc_stride=128, - max_query_length=64, + max_seq_length=self.args["max_seq_length"], + doc_stride=self.args["doc_stride"], + max_query_length=self.args["max_query_length"], is_training=False, return_dataset="pt", - threads=1, - tqdm_enabled=False + threads=self.args["threads"], + tqdm_enabled=self.args["tqdm_enabled"] ) # Note that DistributedSampler samples randomly @@ -89,15 +109,15 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: all_examples=examples, all_features=features, all_results=all_results, - n_best_size=20, - max_answer_length=30, - do_lower_case=True, - output_prediction_file=None, - output_nbest_file=None, - output_null_log_odds_file=None, - verbose_logging=False, - version_2_with_negative=False, - null_score_diff_threshold=0, + n_best_size=self.args["n_best_size"], + max_answer_length=self.args["max_answer_length"], + do_lower_case=self.args["do_lower_case"], + output_prediction_file=self.args["output_prediction_file"], + output_nbest_file=self.args["output_nbest_file"], + output_null_log_odds_file=self.args["output_null_log_odds_file"], + verbose_logging=self.args["verbose_logging"], + version_2_with_negative=self.args["version_2_with_negative"], + null_score_diff_threshold=self.args["null_score_diff_threshold"], tokenizer=self.tokenizer, language=question.language ) From 97dbf351d35515bc4bd6a4df3bf4983357193306 Mon Sep 17 
00:00:00 2001 From: xueguang Date: Fri, 25 Sep 2020 06:51:51 +0800 Subject: [PATCH 24/50] add document for chinese qa --- README.md | 15 ++++++++++----- docs/qa_example_zh.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 docs/qa_example_zh.md diff --git a/README.md b/README.md index 4d77b21..5b42268 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Please refer to their project [Pyserini](https://github.com/castorini/pyserini) ## A Simple Question-Answer Example We provided an online interface to simply play with english QA [here](https://huggingface.co/rsvp-ai/bertserini-bert-base-squad?text=Where+do+I+live%3F&context=My+name+is+Sarah+and+I+live+in+London) -Below is a example for English Question-Answering. We also provide an example for Chinese Question-Answering [here](). +Below is a example for English Question-Answering. We also provide an example for Chinese Question-Answering [here](docs/qa_example_zh.md). ```python from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT @@ -44,7 +44,7 @@ question = Question("Why did Mark Twain call the 19th century the glied age?") # Option 1: fetch some contexts from Wikipedia with Pyserini from bertserini.retriever.pyserini_retriever import retriever, build_searcher -searcher = build_searcher("/path/to/enwiki/index/") +searcher = build_searcher("indexes/lucene-index.enwiki-20180701-paragraphs") contexts = retriever(question, searcher, 10) # Option 2: hard-coded contexts @@ -68,9 +68,14 @@ wget ftp://72.143.107.253/BERTserini/english_wiki_2018_index.zip After unzipping these file, we suggest you putting it in `indexes/`. 
We have uploaded following finetuned checkpoints to the huggingace models:\ -[bertserini-bert-base-squad](https://huggingface.co/rsvp-ai/bertserini-bert-base-squad) \ -[bertserini-bert-large-squad](https://huggingface.co/rsvp-ai/bertserini-bert-large-squad) - +- [bertserini-bert-base-squad](https://huggingface.co/rsvp-ai/bertserini-bert-base-squad) +- [bertserini-bert-large-squad](https://huggingface.co/rsvp-ai/bertserini-bert-large-squad) + +## Experiments +We have evaluated our system on `SQuAD 1.1` and `CMRC2018` development set. +Please see following documents for details: +- [SQuAD experiments](docs/experiments-squad.md) +- [CMRC experiments](docs/experiments-cmrc.md) ## Citation Please cite [the NAACL 2019 paper]((https://www.aclweb.org/anthology/N19-4013/)): diff --git a/docs/qa_example_zh.md b/docs/qa_example_zh.md new file mode 100644 index 0000000..6ad0694 --- /dev/null +++ b/docs/qa_example_zh.md @@ -0,0 +1,43 @@ +## A Simple Question-Answer Example (Chinese) + +```python +from bertserini.reader.base import Question, Context +from bertserini.reader.bert_reader import BERT +from bertserini.utils.utils_new import get_best_answer + +language = "zh" +model_name = "rsvp-ai/bertserini-bert-base-cmrc" +tokenizer_name = "rsvp-ai/bertserini-bert-base-cmrc" +bert_reader = BERT(model_name, tokenizer_name) + +# Here is our question: +question = Question("《战国无双3》是由哪两个公司合作开发的?", language) + +# Option 1: fetch some contexts from Wikipedia with Pyserini +from bertserini.retriever.pyserini_retriever import retriever, build_searcher +searcher = build_searcher("indexes/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors") +contexts = retriever(question, searcher, 10) + +# Option 2: hard-coded contexts +contexts = [Context('《战国无双3》()是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴,分别是以武田信玄等人为主的《关东三国志》,织田信长等人为主的《战国三杰》,石田三成等人为主的《关原的年轻武者》,丰富游戏内的剧情。此部份专门介绍角色,欲知武器情报、奥义字或擅长攻击类型等,请至战国无双系列1.由于乡里大辅先生因故去世,不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 
编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图(不含村雨城),后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多,部分地图会有兼用的状况,战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主,以下是相关介绍。(注:前方加☆者为猛将传新增关卡及地图。)合并本篇和猛将传的内容,村雨城模式剔除,战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品')] + +# Either option, we can ten get the answer candidates by reader +# and then select out the best answer based on the linear +# combination of context score and phase score +candidates = bert_reader.predict(question, contexts) +answer = get_best_answer(candidates, 0.45) +print(answer.text) +``` + + +NOTE: + + The index we used above is Chinese Wikipedia, which can be download via: +``` +wget ftp://72.143.107.253/BERTserini/chinese_wiki_2018_index.zip +``` + +After unzipping these file, we suggest you putting it in `indexes/`. + +We have uploaded following finetuned checkpoints to the huggingace models:\ +[bertserini-bert-base-cmrc](https://huggingface.co/rsvp-ai/bertserini-bert-base-cmrc) \ No newline at end of file From 17d0fb27bc199a6d37c2a5528ea9cb15ece2ef42 Mon Sep 17 00:00:00 2001 From: xueguang Date: Fri, 25 Sep 2020 22:40:20 +0800 Subject: [PATCH 25/50] fix requirements.txt --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7562f84..e2d3e6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ tqdm>=4.45.0 numpy>=1.18.5 pyserini==0.9.4.0 transformers>=2.10.0 -torch==1.5.1+cu101 -torchvision==0.6.1+cu101 +torch==1.5.1 +torchvision==0.6.1 tensorboardX>=2.1 hanziconv>=0.3.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 8d4862d..c96f5ef 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setuptools.setup( name='bertserini', - version='0.0.1', + version='0.0.2', packages=['bertserini', 'bertserini.reader', 'bertserini.retriever', 'bertserini.experiments', 'bertserini.utils'], url='https://github.com/rsvp-ai/bertserini', license='', From 3c9c53f791dbe8dc9a04f4d6536cd2e080f55d15 Mon Sep 17 
00:00:00 2001 From: qguo96 Date: Mon, 5 Oct 2020 15:27:15 -0400 Subject: [PATCH 26/50] Take advantage of pyserini's new prebuilt index features (#10) Take advantage of pyserini's new prebuilt index features #8 Co-authored-by: q35guo --- bertserini/retriever/pyserini_retriever.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index 4a6df79..74a0545 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -13,6 +13,11 @@ def build_searcher(index_path, k1=0.9, b=0.4, language="en"): searcher.object.setLanguage(language) return searcher +def build_searcher_from_prebuilt_index(index_name, k1=0.9, b=0.4, language="en"): + searcher = SimpleSearcher.from_prebuilt_index(index_name) + searcher.set_bm25(k1, b) + searcher.object.setLanguage(language) + return searcher def retriever(question, searcher, para_num=20): language = question.language From 70c878e9bfc4c9087151a62a0c683f0d5246c622 Mon Sep 17 00:00:00 2001 From: qguo96 Date: Mon, 5 Oct 2020 15:28:21 -0400 Subject: [PATCH 27/50] fix experiment documents and download punkt corpus (#11) * Update experiments-cmrc.md * Update experiments-squad.md * Update evaluate_v1_cmrc.py --- bertserini/experiments/eval/evaluate_v1_cmrc.py | 3 +++ docs/experiments-cmrc.md | 4 ++-- docs/experiments-squad.md | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bertserini/experiments/eval/evaluate_v1_cmrc.py b/bertserini/experiments/eval/evaluate_v1_cmrc.py index 0f3ff32..4556d15 100755 --- a/bertserini/experiments/eval/evaluate_v1_cmrc.py +++ b/bertserini/experiments/eval/evaluate_v1_cmrc.py @@ -17,6 +17,9 @@ #from utils import init_logger #logger = init_logger("evaluation") +#install punkt corpus +nltk.download('punkt') + # split Chinese with English def mixed_segmentation(in_str, rm_punc=False): in_str = str(in_str).lower().strip() diff --git a/docs/experiments-cmrc.md 
b/docs/experiments-cmrc.md index ab63f13..c551260 100644 --- a/docs/experiments-cmrc.md +++ b/docs/experiments-cmrc.md @@ -58,7 +58,7 @@ python -m bertserini.experiments.inference --dataset_path data/cmrc2018_dev_squa ``` mkdir temp -pyhton -m bertserini.experiments.evaluate --eval_data data/cmrc2018_dev.json \ +python -m bertserini.experiments.evaluate --eval_data data/cmrc2018_dev.json \ --search_file prediction/cmrc2018_pred.json \ --output_path temp \ --dataset cmrc @@ -70,4 +70,4 @@ Expected results: ``` ## rsvp-ai/bertserini-bert-base-cmrc, this is bert-base-chinese finetuned on the chinese reading comprehension dataset(CMRC) (0.5, {'f1_score': 68.0033167812909, 'exact_match': 51.164958061509786, 'total_count': 3219, 'skip_count': 1}) -``` \ No newline at end of file +``` diff --git a/docs/experiments-squad.md b/docs/experiments-squad.md index 105b67d..65a275b 100644 --- a/docs/experiments-squad.md +++ b/docs/experiments-squad.md @@ -63,7 +63,7 @@ python -m bertserini.experiments.inference --dataset_path data/dev-v1.1.json \ ``` mkdir temp -pyhton -m bertserini.experiments.evaluate --eval_data data/dev-v1.1.json \ +python -m bertserini.experiments.evaluate --eval_data data/dev-v1.1.json \ --search_file prediction/squad_bert_large_pred.json \ --output_path temp \ --dataset squad @@ -76,4 +76,4 @@ Expected results: ## rsvp-ai/bertserini-bert-base-squad, this is finetuned based on bert-base-uncased (0.5, {'exact_match': 40.179754020813625, 'f1': 47.828056659017584, 'recall': 49.517951036176, 'precision': 48.3495034100538, 'cover': 45.50614947965941, 'overlap': 56.20624408703879}) -``` \ No newline at end of file +``` From cb0be105cdadbc8fd3941500f16780a0a469eec2 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Fri, 4 Feb 2022 16:39:43 +0000 Subject: [PATCH 28/50] runable when update to huggingface 4.5. 
Possible issue: inference super slow --- bertserini/experiments/args.py | 4 +- bertserini/reader/bert_reader.py | 29 +- bertserini/utils/utils_squad.py | 9 +- bertserini/utils/utils_squad_new.py | 777 ++++++++++++++++++++++++++++ test.py | 26 + 5 files changed, 828 insertions(+), 17 deletions(-) create mode 100644 bertserini/utils/utils_squad_new.py create mode 100644 test.py diff --git a/bertserini/experiments/args.py b/bertserini/experiments/args.py index 376790e..e3a9ccb 100644 --- a/bertserini/experiments/args.py +++ b/bertserini/experiments/args.py @@ -26,7 +26,7 @@ ) parser.add_argument( "--tokenizer_name", - default="", + default=None, type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) @@ -49,4 +49,4 @@ type=int, help="The number of contexts retrieved for a question", ) -args = parser.parse_args() \ No newline at end of file +args = parser.parse_args() diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 0b30a23..3376d03 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -1,9 +1,9 @@ from typing import List -from transformers import AutoTokenizer, AutoModelForQuestionAnswering, squad_convert_examples_to_features +from transformers import AutoTokenizer, AutoModelForQuestionAnswering #squad_convert_examples_to_features from torch.utils.data import DataLoader, SequentialSampler import torch -from transformers.data.processors.squad import SquadResult +#from transformers.data.processors.squad import SquadResult from bertserini.reader.base import Reader, Question, Context, Answer @@ -11,8 +11,11 @@ from bertserini.train.run_squad import to_list -from bertserini.utils.utils_squad import SquadExample, compute_predictions_logits - +#from bertserini.utils.utils_squad import SquadExample, compute_predictions_logits +from transformers import SquadExample +#from transformers.data.metrics.squad_metrics import compute_predictions_logits +from 
bertserini.utils.utils_squad_new import compute_predictions_logits +from transformers.data.processors.squad import squad_convert_examples_to_features, SquadResult def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: examples = [] @@ -27,7 +30,7 @@ def craft_squad_examples(question: Question, contexts: List[Context]) -> List[Sq title="", is_impossible=False, answers=[], - language=ctx.language + #language=ctx.language ) ) return examples @@ -39,7 +42,7 @@ def __init__(self, model_name: str, tokenizer_name: str = None): tokenizer_name = model_name self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(self.device).eval() - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True, use_fast=False) self.args = { "max_seq_length": 384, "doc_stride": 128, @@ -98,14 +101,16 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) - output = [to_list(output[i]) for output in outputs] + output = [output[i] for output in outputs] - start_logits, end_logits = output + #start_logits, end_logits = output + start_logits = outputs.start_logits[i] + end_logits = outputs.end_logits[i] result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) - answers, _ = compute_predictions_logits( + answers = compute_predictions_logits( all_examples=examples, all_features=features, all_results=all_results, @@ -119,14 +124,14 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: version_2_with_negative=self.args["version_2_with_negative"], null_score_diff_threshold=self.args["null_score_diff_threshold"], tokenizer=self.tokenizer, - language=question.language + # language=question.language ) 
all_answers = [] for idx, ans in enumerate(answers): all_answers.append(Answer( - text=answers[ans][0], - score=answers[ans][1], + text=answers[ans][0]["text"], + score=answers[ans][0]["probability"], ctx_score=contexts[idx].score, language=question.language )) diff --git a/bertserini/utils/utils_squad.py b/bertserini/utils/utils_squad.py index 7c018ab..3301cdd 100644 --- a/bertserini/utils/utils_squad.py +++ b/bertserini/utils/utils_squad.py @@ -15,7 +15,9 @@ import re import string -from transformers.tokenization_bert import BasicTokenizer +#from transformers.tokenization_bert import BasicTokenizer +#from transformers.models.bert.tokenization_bert import BasicTokenizer +from transformers import AutoTokenizer logger = logging.getLogger(__name__) @@ -313,7 +315,7 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ return evaluation -def get_final_text(pred_text, orig_text, do_lower_case, language="zh", verbose_logging=False): +def get_final_text(pred_text, orig_text, do_lower_case, language="en", verbose_logging=False, tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original @@ -356,7 +358,8 @@ def _strip_spaces(text): # and `pred_text`, and check if they are the same length. If they are # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. - tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False) if language=="zh": tok_text = "".join(tokenizer.tokenize(orig_text)) else: diff --git a/bertserini/utils/utils_squad_new.py b/bertserini/utils/utils_squad_new.py new file mode 100644 index 0000000..19346fa --- /dev/null +++ b/bertserini/utils/utils_squad_new.py @@ -0,0 +1,777 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to +update `find_best_threshold` scripts for SQuAD V2.0 + +In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an +additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted +probability that a question is unanswerable. 
+""" + + +import collections +import json +import math +import re +import string + +#from ...models.bert import BasicTokenizer +#from ...utils import logging +from transformers import AutoTokenizer + + +#logger = logging.get_logger(__name__) + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def get_raw_scores(examples, preds): + """ + Computes the exact and f1 scores from the examples and the model predictions + """ + exact_scores = {} + f1_scores = {} + + for example in examples: + qas_id = example.qas_id + gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] + + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [""] + + if qas_id not in preds: + print(f"Missing prediction 
for {qas_id}") + continue + + prediction = preds[qas_id] + exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) + f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) + + return exact_scores, f1_scores + + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores.values()) / total), + ("f1", 100.0 * sum(f1_scores.values()) / total), + ("total", total), + ] + ) + else: + total = len(qid_list) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ("total", total), + ] + ) + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval[f"{prefix}_{k}"] = new_eval[k] + + +def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + + has_ans_score, has_ans_cnt = 0, 0 + for qid in qid_list: + if not qid_to_has_ans[qid]: + continue + has_ans_cnt += 1 + + if qid not in scores: + continue + has_ans_score += scores[qid] + + return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt + 
+ +def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + main_eval["has_ans_exact"] = has_ans_exact + main_eval["has_ans_f1"] = has_ans_f1 + + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for _, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + + +def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): + qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} + has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] + no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] + + if no_answer_probs is None: + 
no_answer_probs = {k: 0.0 for k in preds} + + exact, f1 = get_raw_scores(examples, preds) + + exact_threshold = apply_no_ans_threshold( + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold + ) + f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + + evaluation = make_eval_dict(exact_threshold, f1_threshold) + + if has_answer_qids: + has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) + merge_eval(evaluation, has_ans_eval, "HasAns") + + if no_answer_qids: + no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) + merge_eval(evaluation, no_ans_eval, "NoAns") + + if no_answer_probs: + find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) + + return evaluation + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. 
This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + #tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="rsvp-ai/bertserini-bert-base-squad", use_fast=False) + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + #if verbose_logging: + #logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'") + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + #if verbose_logging: + #logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'") + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
+ tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + #if verbose_logging: + # logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + #if verbose_logging: + # logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position : (orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def compute_predictions_logits( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold, + tokenizer, +): + """Write final 
predictions to the json file and log-odds of null if needed.""" + #if output_prediction_file: + # logger.info(f"Writing predictions to: {output_prediction_file}") + #if output_nbest_file: + # logger.info(f"Writing nbest to: {output_nbest_file}") + #if output_null_log_odds_file and version_2_with_negative: + # logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}") + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = 
result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index], + ) + ) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # tok_text = " ".join(tok_tokens) + # + # 
# De-tokenize WordPieces that have been split off. + # tok_text = tok_text.replace(" ##", "") + # tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest) == 1: + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1, "No valid predictions" + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1, "No valid predictions" + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + + if output_prediction_file: + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + if output_nbest_file: + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if output_null_log_odds_file and version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_nbest_json + + +def compute_predictions_log_probs( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + 
verbose_logging, +): + """ + XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of + null if needed. + + Requires utils_squad_evaluate.py + """ + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) + + #logger.info(f"Writing predictions to: {output_prediction_file}") + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + + cur_null_score = result.cls_logits + + # if we could have irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + for i in range(start_n_top): + for j in range(end_n_top): + start_log_prob = result.start_logits[i] + start_index = result.start_top_index[i] + + j_index = i * end_n_top + j + + end_log_prob = result.end_logits[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= feature.paragraph_len - 1: + continue + if end_index >= feature.paragraph_len - 1: + continue + + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob, + ) + ) + + prelim_predictions = sorted( + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + # XLNet un-tokenizer + # Let's keep it simple for now and see if we need all this later. + # + # tok_start_to_orig_index = feature.tok_start_to_orig_index + # tok_end_to_orig_index = feature.tok_end_to_orig_index + # start_orig_pos = tok_start_to_orig_index[pred.start_index] + # end_orig_pos = tok_end_to_orig_index[pred.end_index] + # paragraph_text = example.paragraph_text + # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + # Previously used Bert untokenizer + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + if hasattr(tokenizer, "do_lower_case"): + do_lower_case = tokenizer.do_lower_case + else: + do_lower_case = tokenizer.do_lowercase_and_remove_accent + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + + if final_text 
in seen_predictions: + continue + + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1, "No valid predictions" + assert best_non_null_entry is not None, "No valid predictions" + + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions diff --git a/test.py b/test.py new file mode 100644 index 0000000..c2c3c35 --- /dev/null +++ b/test.py @@ -0,0 +1,26 @@ + +from bertserini.reader.base import Question, Context +from bertserini.reader.bert_reader 
 import BERT
+from bertserini.utils.utils_new import get_best_answer
+
+model_name = "rsvp-ai/bertserini-bert-base-squad"
+tokenizer_name = "rsvp-ai/bertserini-bert-base-squad"
+bert_reader = BERT(model_name, tokenizer_name)
+
+question = Question("Why did Mark Twain call the 19th century the glied age?")
+
+contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')]
+
+candidates = bert_reader.predict(question, contexts)
+answer = get_best_answer(candidates, 0.45)
+print(answer.text)
+print("local context passed")
+
+from bertserini.retriever.pyserini_retriever import retriever, build_searcher
+searcher = build_searcher("indexes/lucene-index.enwiki-20180701")
+contexts = retriever(question, searcher, 10)
+candidates = bert_reader.predict(question, contexts)
+answer = get_best_answer(candidates, 0.45)
+print(answer.text)
+print("e2e context passed")
+

From 835ab58dbdd69d248553b4ca2cd7b0f4b00e57fe Mon Sep 17 00:00:00 2001
From: amyxie361
Date: Thu, 3 Mar 2022 04:26:16 +0000
Subject: [PATCH 29/50] runnable squad inference+eval. Changes: fix transformers version, update index path, minor error fix and readme update

---
 README.md                      | 3 ++-
 bertserini/experiments/args.py | 4 ++--
 docs/experiments-squad.md      | 5 +++--
 requirements.txt               | 5 +++--
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 5b42268..df17789 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,8 @@ NOTE: The index we used above is English Wikipedia, which could be download via:
 ```
-wget ftp://72.143.107.253/BERTserini/english_wiki_2018_index.zip
+#wget ftp://72.143.107.253/BERTserini/english_wiki_2018_index.zip (deprecated)
+wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.enwiki-20180701-paragraphs.tar.gz
 ```
 After unzipping these file, we suggest you putting it in `indexes/`.
diff --git a/bertserini/experiments/args.py b/bertserini/experiments/args.py index 376790e..e3a9ccb 100644 --- a/bertserini/experiments/args.py +++ b/bertserini/experiments/args.py @@ -26,7 +26,7 @@ ) parser.add_argument( "--tokenizer_name", - default="", + default=None, type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) @@ -49,4 +49,4 @@ type=int, help="The number of contexts retrieved for a question", ) -args = parser.parse_args() \ No newline at end of file +args = parser.parse_args() diff --git a/docs/experiments-squad.md b/docs/experiments-squad.md index 65a275b..c1a7fad 100644 --- a/docs/experiments-squad.md +++ b/docs/experiments-squad.md @@ -45,7 +45,7 @@ For `rsvp-ai/bertserini-bert-base-squad` python -m bertserini.experiments.inference --dataset_path data/dev-v1.1.json \ --index_path indexes/lucene-index.enwiki-20180701-paragraphs \ --model_name_or_path rsvp-ai/bertserini-bert-base-squad \ - --output squad_bert_base_pred.json + --output squad_bert_base_pred.json \ --topk 10 ``` @@ -55,7 +55,7 @@ For `rsvp-ai/bertserini-bert-large-squad` python -m bertserini.experiments.inference --dataset_path data/dev-v1.1.json \ --index_path indexes/lucene-index.enwiki-20180701-paragraphs \ --model_name_or_path rsvp-ai/bertserini-bert-large-squad \ - --output prediction/squad_bert_large_pred.json + --output prediction/squad_bert_large_pred.json \ --topk 10 ``` @@ -76,4 +76,5 @@ Expected results: ## rsvp-ai/bertserini-bert-base-squad, this is finetuned based on bert-base-uncased (0.5, {'exact_match': 40.179754020813625, 'f1': 47.828056659017584, 'recall': 49.517951036176, 'precision': 48.3495034100538, 'cover': 45.50614947965941, 'overlap': 56.20624408703879}) +Mar 02, 2022 reproduce by @amyxie361, (0.5, {'exact_match': 40.22705771050142, 'f1': 47.82830395639832, 'recall': 49.52073687640621, 'precision': 48.336237855777995, 'cover': 45.572374645222325, 'overlap': 56.12109744560075}) ``` diff --git a/requirements.txt b/requirements.txt 
index e2d3e6f..59bd6b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ tqdm>=4.45.0 numpy>=1.18.5 pyserini==0.9.4.0 -transformers>=2.10.0 +transformers==3.4.0 torch==1.5.1 torchvision==0.6.1 tensorboardX>=2.1 -hanziconv>=0.3.2 \ No newline at end of file +hanziconv>=0.3.2 +nltk From 618ad874133cab7d85d66c4c976c17f493656bcd Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Sat, 5 Mar 2022 01:30:38 +0000 Subject: [PATCH 30/50] revise version --- bertserini/experiments/inference.py | 3 +++ bertserini/reader/base.py | 18 ++++++++++++++++++ bertserini/reader/bert_reader.py | 8 ++++---- bertserini/retriever/pyserini_retriever.py | 14 ++++++++++---- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index de6d042..8f86961 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -12,9 +12,12 @@ all_answer = [] for question in tqdm(questions): + print("Start retrieve") contexts = retriever(question, searcher, args.topk) + print("Start read") final_answers = bert_reader.predict(question, contexts) final_answers_lst = [] + print("start rerank") for ans in final_answers: final_answers_lst.append( {"id": question.id, diff --git a/bertserini/reader/base.py b/bertserini/reader/base.py index 75f080e..85dd442 100644 --- a/bertserini/reader/base.py +++ b/bertserini/reader/base.py @@ -23,6 +23,12 @@ def __init__(self, text: str, id: Optional[str] = None, language: str = "en"): self.id = id self.language = language + def __repr__(self): + return str(self) + + def __str__(self): + return "".format(self.text) + class Context: """ @@ -53,6 +59,12 @@ def __init__(self, self.metadata = metadata self.score = score + def __repr__(self): + return str(self) + + def __str__(self): + return "".format(self.text, self.score) + class Answer: """ @@ -87,6 +99,12 @@ def __init__(self, self.ctx_score = ctx_score self.total_score = total_score + def 
__repr__(self): + return str(self) + + def __str__(self): + return "".format(self.text, self.score, self.ctx_score, self.total_score) + def aggregate_score(self, weight): self.total_score = weight*self.score + (1-weight)*self.ctx_score diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 3376d03..ec52ec4 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -13,8 +13,8 @@ #from bertserini.utils.utils_squad import SquadExample, compute_predictions_logits from transformers import SquadExample -#from transformers.data.metrics.squad_metrics import compute_predictions_logits -from bertserini.utils.utils_squad_new import compute_predictions_logits +from transformers.data.metrics.squad_metrics import compute_predictions_logits +#from bertserini.utils.utils_squad_new import compute_predictions_logits from transformers.data.processors.squad import squad_convert_examples_to_features, SquadResult def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: @@ -101,7 +101,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) - output = [output[i] for output in outputs] + output = [outputs[oname][i] for oname in outputs] #start_logits, end_logits = output start_logits = outputs.start_logits[i] @@ -110,7 +110,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: all_results.append(result) - answers = compute_predictions_logits( + answers = compute_predictions_logits( all_examples=examples, all_features=features, all_results=all_results, diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index 74a0545..7185085 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -1,6 +1,8 @@ from typing import List +import json -from pyserini.search 
import SimpleSearcher, JSimpleSearcherResult +#from pyserini.search import SimpleSearcher, JSimpleSearcherResult +from pyserini.search.lucene import LuceneSearcher, JLuceneSearcherResult from bertserini.utils.utils import init_logger from bertserini.reader.base import Context @@ -8,13 +10,15 @@ def build_searcher(index_path, k1=0.9, b=0.4, language="en"): - searcher = SimpleSearcher(index_path) + #searcher = SimpleSearcher(index_path) + searcher = LuceneSearcher(index_path) searcher.set_bm25(k1, b) searcher.object.setLanguage(language) return searcher def build_searcher_from_prebuilt_index(index_name, k1=0.9, b=0.4, language="en"): - searcher = SimpleSearcher.from_prebuilt_index(index_name) + #searcher = SimpleSearcher.from_prebuilt_index(index_name) + searcher = LuceneSearcher.from_prebuilt_index(index_name) searcher.set_bm25(k1, b) searcher.object.setLanguage(language) return searcher @@ -32,7 +36,8 @@ def retriever(question, searcher, para_num=20): return hits_to_contexts(hits, language) -def hits_to_contexts(hits: List[JSimpleSearcherResult], language="en", field='raw', blacklist=[]) -> List[Context]: +def hits_to_contexts(hits: List[JLuceneSearcherResult], #List[JSimpleSearcherResult] + language="en", field="raw", blacklist=[]) -> List[Context]: """ Converts hits from Pyserini into a list of texts. 
     Parameters
@@ -53,6 +58,7 @@ def hits_to_contexts(hits: List[JSimpleSearcherResult], language="en", field='ra
     """
     contexts = []
     for i in range(0, len(hits)):
         t = hits[i].raw if field == 'raw' else hits[i].contents
+        t = json.loads(t)["contents"]
         for s in blacklist:
             if s in t:
                 continue

From d9d15ab849fd953f64425805037900aec4168e16 Mon Sep 17 00:00:00 2001
From: amyxie361
Date: Fri, 11 Mar 2022 00:01:10 +0000
Subject: [PATCH 31/50] runnable Chinese

---
 .gitignore                                 |   3 +
 bertserini/experiments/args.py             |   6 +
 bertserini/experiments/eval/evaluate_v1.py |   9 +-
 bertserini/experiments/inference.py        |   5 +-
 bertserini/reader/bert_reader.py           |  19 +-
 bertserini/retriever/pyserini_retriever.py |  10 +-
 bertserini/utils/utils_squad_metrics.py    | 783 +++++++++++++++++++++
 docs/experiments-cmrc.md                   |  13 +-
 8 files changed, 821 insertions(+), 27 deletions(-)
 create mode 100644 bertserini/utils/utils_squad_metrics.py

diff --git a/.gitignore b/.gitignore
index 06d575f..971a53d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 cache*
 *.log
 *.json
+*.sh
 
 .idea/
 
@@ -8,3 +9,5 @@ build/
 dist/
 
 bertserini.egg-info/
+
+indexes/
diff --git a/bertserini/experiments/args.py b/bertserini/experiments/args.py
index e3a9ccb..abc389f 100644
--- a/bertserini/experiments/args.py
+++ b/bertserini/experiments/args.py
@@ -37,6 +37,12 @@
     required=True,
     help="The output file where the runs results will be written to",
 )
+parser.add_argument(
+    "--output_nbest_file",
+    default="./tmp.nbest",
+    type=str,
+    help="The output file for store nbest results temporarily",
+)
 parser.add_argument(
     "--language",
     default="en",
diff --git a/bertserini/experiments/eval/evaluate_v1.py b/bertserini/experiments/eval/evaluate_v1.py
index b853597..e9ef60a 100755
--- a/bertserini/experiments/eval/evaluate_v1.py
+++ b/bertserini/experiments/eval/evaluate_v1.py
@@ -98,8 +98,7 @@ def evaluate(dataset, predictions):
         for qa in paragraph['qas']:
             total += 1
             if qa['id'] not in predictions:
-                message = 'Unanswered question ' + qa['id'] + \
' will receive score 0.' + message = 'Unanswered question ' + str(qa['id']) + ' will receive score 0.' logger.error(message) continue ground_truths = list(map(lambda x: x['text'], qa['answers'])) @@ -137,9 +136,9 @@ def squad_v1_eval(dataset_filename, prediction_filename): expected_version = '1.1' with open(dataset_filename) as dataset_file: dataset_json = json.load(dataset_file) - if dataset_json['version'] != expected_version: - logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( - expected_version, dataset_json['version'])) + #if dataset_json['version'] != expected_version: + # logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( + # expected_version, dataset_json['version'])) dataset = dataset_json['data'] with open(prediction_filename) as prediction_file: predictions = json.load(prediction_file) diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index 8f86961..f490ae2 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -7,17 +7,14 @@ if __name__ == "__main__": questions = extract_squad_questions(args.dataset_path) - bert_reader = BERT(args.model_name_or_path, args.tokenizer_name) + bert_reader = BERT(args.model_name_or_path, args.tokenizer_name, args.output_nbest_file) searcher = build_searcher(args.index_path, language=args.language) all_answer = [] for question in tqdm(questions): - print("Start retrieve") contexts = retriever(question, searcher, args.topk) - print("Start read") final_answers = bert_reader.predict(question, contexts) final_answers_lst = [] - print("start rerank") for ans in final_answers: final_answers_lst.append( {"id": question.id, diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index ec52ec4..1c63601 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -1,4 +1,5 @@ from typing import List +import json from transformers import AutoTokenizer, 
AutoModelForQuestionAnswering #squad_convert_examples_to_features from torch.utils.data import DataLoader, SequentialSampler @@ -11,10 +12,8 @@ from bertserini.train.run_squad import to_list -#from bertserini.utils.utils_squad import SquadExample, compute_predictions_logits from transformers import SquadExample -from transformers.data.metrics.squad_metrics import compute_predictions_logits -#from bertserini.utils.utils_squad_new import compute_predictions_logits +from bertserini.utils.utils_squad_metrics import compute_predictions_logits from transformers.data.processors.squad import squad_convert_examples_to_features, SquadResult def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: @@ -37,7 +36,7 @@ def craft_squad_examples(question: Question, contexts: List[Context]) -> List[Sq class BERT(Reader): - def __init__(self, model_name: str, tokenizer_name: str = None): + def __init__(self, model_name: str, tokenizer_name: str = None, output_nbest_file=None): if tokenizer_name is None: tokenizer_name = model_name self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -53,7 +52,7 @@ def __init__(self, model_name: str, tokenizer_name: str = None): "max_answer_length": 30, "do_lower_case": True, "output_prediction_file": False, - "output_nbest_file": None, + "output_nbest_file": output_nbest_file, "output_null_log_odds_file": None, "verbose_logging": False, "version_2_with_negative": True, @@ -110,7 +109,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: all_results.append(result) - answers = compute_predictions_logits( + answers, nbest = compute_predictions_logits( all_examples=examples, all_features=features, all_results=all_results, @@ -124,14 +123,14 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: version_2_with_negative=self.args["version_2_with_negative"], null_score_diff_threshold=self.args["null_score_diff_threshold"], 
tokenizer=self.tokenizer, - # language=question.language ) + #nbest = json.load(open(self.args["output_nbest_file"],'r')) all_answers = [] - for idx, ans in enumerate(answers): + for idx, ans in enumerate(nbest): all_answers.append(Answer( - text=answers[ans][0]["text"], - score=answers[ans][0]["probability"], + text=nbest[ans][0]["text"], + score=nbest[ans][0]["start_logit"] + nbest[ans][0]["end_logit"], ctx_score=contexts[idx].score, language=question.language )) diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index 7185085..f69c0aa 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -57,8 +57,14 @@ def hits_to_contexts(hits: List[JLuceneSearcherResult], #List[JSimpleSearcherRes """ contexts = [] for i in range(0, len(hits)): - t = hits[i].raw if field == 'raw' else hits[i].contents - t = json.loads(t)["contents"] + if field == 'raw': + t = hits[i].raw + else: + t = hits[i].contents + try: + t = json.loads(t)["contents"] + except: + pass for s in blacklist: if s in t: continue diff --git a/bertserini/utils/utils_squad_metrics.py b/bertserini/utils/utils_squad_metrics.py new file mode 100644 index 0000000..1808c99 --- /dev/null +++ b/bertserini/utils/utils_squad_metrics.py @@ -0,0 +1,783 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to +update `find_best_threshold` scripts for SQuAD V2.0 + +In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an +additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted +probability that a question is unanswerable. +""" + + +import collections +import json +import math +import re +import string + +#from ...models.bert import BasicTokenizer +#from ...utils import logging +from transformers.models.bert import BasicTokenizer +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return 
f1 + + +def get_raw_scores(examples, preds): + """ + Computes the exact and f1 scores from the examples and the model predictions + """ + exact_scores = {} + f1_scores = {} + + for example in examples: + qas_id = example.qas_id + gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] + + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [""] + + if qas_id not in preds: + print(f"Missing prediction for {qas_id}") + continue + + prediction = preds[qas_id] + exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) + f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) + + return exact_scores, f1_scores + + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores.values()) / total), + ("f1", 100.0 * sum(f1_scores.values()) / total), + ("total", total), + ] + ) + else: + total = len(qid_list) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ("total", total), + ] + ) + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval[f"{prefix}_{k}"] = new_eval[k] + + +def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in enumerate(qid_list): + if qid not in scores: 
+ continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + + has_ans_score, has_ans_cnt = 0, 0 + for qid in qid_list: + if not qid_to_has_ans[qid]: + continue + has_ans_cnt += 1 + + if qid not in scores: + continue + has_ans_score += scores[qid] + + return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt + + +def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + main_eval["has_ans_exact"] = has_ans_exact + main_eval["has_ans_f1"] = has_ans_f1 + + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for _, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh 
+ main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + + +def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): + qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} + has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] + no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] + + if no_answer_probs is None: + no_answer_probs = {k: 0.0 for k in preds} + + exact, f1 = get_raw_scores(examples, preds) + + exact_threshold = apply_no_ans_threshold( + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold + ) + f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + + evaluation = make_eval_dict(exact_threshold, f1_threshold) + + if has_answer_qids: + has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) + merge_eval(evaluation, has_ans_eval, "HasAns") + + if no_answer_qids: + no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) + merge_eval(evaluation, no_ans_eval, "NoAns") + + if no_answer_probs: + find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) + + return evaluation + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. 
+ # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. 
+ tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'") + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'") + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position : (orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + 
+def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def compute_predictions_logits( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold, + tokenizer, +): + """Write final predictions to the json file and log-odds of null if needed.""" + if output_prediction_file: + logger.info(f"Writing predictions to: {output_prediction_file}") + if output_nbest_file: + logger.info(f"Writing nbest to: {output_nbest_file}") + if output_null_log_odds_file and version_2_with_negative: + logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}") + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + 
min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index], + ) + ) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # tok_text = " ".join(tok_tokens) + # + # # De-tokenize WordPieces that have been split off. 
+ # tok_text = tok_text.replace(" ##", "") + # tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest) == 1: + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1, "No valid predictions" + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + try: + output["start_logit"] = output["start_logit"].item() + output["end_logit"] = output["end_logit"].item() + except: + pass + nbest_json.append(output) + + assert len(nbest_json) >= 1, "No valid predictions" + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + + if output_prediction_file: + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + if output_nbest_file: + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if output_null_log_odds_file and version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, all_nbest_json + + +def compute_predictions_log_probs( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + 
output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging, +): + """ + XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of + null if needed. + + Requires utils_squad_evaluate.py + """ + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) + + logger.info(f"Writing predictions to: {output_prediction_file}") + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + + cur_null_score = result.cls_logits + + # if we could have irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + for i in range(start_n_top): + for j in range(end_n_top): + start_log_prob = result.start_logits[i] + start_index = result.start_top_index[i] + + j_index = i * end_n_top + j + + end_log_prob = result.end_logits[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid 
predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= feature.paragraph_len - 1: + continue + if end_index >= feature.paragraph_len - 1: + continue + + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob, + ) + ) + + prelim_predictions = sorted( + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + # XLNet un-tokenizer + # Let's keep it simple for now and see if we need all this later. 
+ # + # tok_start_to_orig_index = feature.tok_start_to_orig_index + # tok_end_to_orig_index = feature.tok_end_to_orig_index + # start_orig_pos = tok_start_to_orig_index[pred.start_index] + # end_orig_pos = tok_end_to_orig_index[pred.end_index] + # paragraph_text = example.paragraph_text + # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + # Previously used Bert untokenizer + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + if hasattr(tokenizer, "do_lower_case"): + do_lower_case = tokenizer.do_lower_case + else: + do_lower_case = tokenizer.do_lowercase_and_remove_accent + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1, "No valid predictions" + assert best_non_null_entry is not None, "No valid predictions" + + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions diff --git a/docs/experiments-cmrc.md b/docs/experiments-cmrc.md index c551260..85a42dc 100644 --- a/docs/experiments-cmrc.md +++ b/docs/experiments-cmrc.md @@ -7,16 +7,16 @@ We have indexed the 2018 Wikipedia Chinese dump. 
You can download the prepared index here: ``` -wget ftp://72.143.107.253/BERTserini/chinese_wiki_2018_index.zip +#wget ftp://72.143.107.253/BERTserini/chinese_wiki_2018_index.zip +wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.zhwiki-20181201-paragraphs.tar.gz ``` -```*index.zip``` contains the indexed latest Wikipedia dump with Anserini. After unzipping these files, put them under the root path of this repo, and then you are ready to go. Take the following folder structure as an example: ``` bertserini +--- indexes -| +--- lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors +| +--- lucene-index.zhwiki-20181201-paragraphs | | +--- ... +--- other files under this repo ``` @@ -46,10 +46,10 @@ mv index.html cmrc2018_dev_squad.json For `rsvp-ai/bertserini-bert-base-cmrc` ``` python -m bertserini.experiments.inference --dataset_path data/cmrc2018_dev_squad.json \ - --index_path indexes/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors \ + --index_path indexes/lucene-index.zhwiki-20181201-paragraphs \ --model_name_or_path rsvp-ai/bertserini-bert-base-cmrc \ - --output prediction/cmrc2018_pred.json - --topk 10 + --output prediction/cmrc2018_pred.json \ + --topk 10 \ --language zh ``` @@ -70,4 +70,5 @@ Expected results: ``` ## rsvp-ai/bertserini-bert-base-cmrc, this is bert-base-chinese finetuned on the chinese reading comprehension dataset(CMRC) (0.5, {'f1_score': 68.0033167812909, 'exact_match': 51.164958061509786, 'total_count': 3219, 'skip_count': 1}) +replicated Mar 10 2022 by @amyxie361 (0.5, {'f1_score': 65.64519666259483, 'exact_match': 49.98446722584654, 'total_count': 3219, 'skip_count': 0}) ``` From f7c79e9426b367e5248b95ce8cde2fe0854eb95e Mon Sep 17 00:00:00 2001 From: Ikram Ali Date: Sat, 19 Mar 2022 04:10:14 +0500 Subject: [PATCH 32/50] update url (#22) Update URL https://github.com/rsvp-ai/bertserini.git with https://github.com/castorini/bertserini --- 
docs/experiments-squad.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/experiments-squad.md b/docs/experiments-squad.md index c1a7fad..f1340c4 100644 --- a/docs/experiments-squad.md +++ b/docs/experiments-squad.md @@ -1,6 +1,6 @@ # Bertserini: Baseline on SQUAD QA -1. Clone the repo with ```git clone https://github.com/rsvp-ai/bertserini.git``` +1. Clone the repo with ```git clone https://github.com/castorini/bertserini.git``` 2. ```pip install -r requirements.txt``` ## Download PreBuilt Wikipedia Index From bb7cffbfc430b99e70b5f5ab18b3e63775553e1b Mon Sep 17 00:00:00 2001 From: Yuqing Xie Date: Fri, 18 Mar 2022 23:04:38 -0400 Subject: [PATCH 33/50] add string representations for base classes (#24) --- bertserini/reader/base.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bertserini/reader/base.py b/bertserini/reader/base.py index 75f080e..85dd442 100644 --- a/bertserini/reader/base.py +++ b/bertserini/reader/base.py @@ -23,6 +23,12 @@ def __init__(self, text: str, id: Optional[str] = None, language: str = "en"): self.id = id self.language = language + def __repr__(self): + return str(self) + + def __str__(self): + return "<Question, text: {}>".format(self.text) + class Context: """ @@ -53,6 +59,12 @@ def __init__(self, self.metadata = metadata self.score = score + def __repr__(self): + return str(self) + + def __str__(self): + return "<Context, text: {}, score: {}>".format(self.text, self.score) + class Answer: """ @@ -87,6 +99,12 @@ def __init__(self, self.ctx_score = ctx_score self.total_score = total_score + def __repr__(self): + return str(self) + + def __str__(self): + return "<Answer, text: {}, score: {}, ctx_score: {}, total_score: {}>".format(self.text, self.score, self.ctx_score, self.total_score) + def aggregate_score(self, weight): self.total_score = weight*self.score + (1-weight)*self.ctx_score From 9a627aa3acc4a36da36dcec892768432210f33aa Mon Sep 17 00:00:00 2001 From: Yuqing Xie Date: Fri, 18 Mar 2022 23:21:07 -0400 Subject: [PATCH 34/50] Update apis for transformers 4.17 and
also update pyserini api (#23) * update api to become compatiable with HF transformers 4.17 * recover squad metric for chinese fix * change from BertTokenizer to AutoTokenizer --- bertserini/experiments/args.py | 6 + bertserini/reader/bert_reader.py | 23 +- bertserini/retriever/pyserini_retriever.py | 15 +- ...{utils_squad.py => utils_squad_metrics.py} | 324 +++++++++++++----- docs/experiments-cmrc.md | 12 +- test.py | 26 ++ 6 files changed, 290 insertions(+), 116 deletions(-) rename bertserini/utils/{utils_squad.py => utils_squad_metrics.py} (69%) create mode 100644 test.py diff --git a/bertserini/experiments/args.py b/bertserini/experiments/args.py index e3a9ccb..abc389f 100644 --- a/bertserini/experiments/args.py +++ b/bertserini/experiments/args.py @@ -37,6 +37,12 @@ required=True, help="The output file where the runs results will be written to", ) +parser.add_argument( + "--output_nbest_file", + default="./tmp.nbest", + type=str, + help="The output file for store nbest results temporarily", +) parser.add_argument( "--language", default="en", diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 0b30a23..8641c51 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -1,18 +1,17 @@ from typing import List -from transformers import AutoTokenizer, AutoModelForQuestionAnswering, squad_convert_examples_to_features +from transformers import AutoTokenizer, AutoModelForQuestionAnswering, SquadExample from torch.utils.data import DataLoader, SequentialSampler import torch -from transformers.data.processors.squad import SquadResult +from transformers.data.processors.squad import SquadResult, squad_convert_examples_to_features from bertserini.reader.base import Reader, Question, Context, Answer +from bertserini.utils.utils_squad_metrics import compute_predictions_logits __all__ = ['BERT'] from bertserini.train.run_squad import to_list -from bertserini.utils.utils_squad import SquadExample, 
compute_predictions_logits - def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: examples = [] @@ -34,12 +33,12 @@ def craft_squad_examples(question: Question, contexts: List[Context]) -> List[Sq class BERT(Reader): - def __init__(self, model_name: str, tokenizer_name: str = None): + def __init__(self, model_name: str, tokenizer_name: str = None, output_nbest_file=None): if tokenizer_name is None: tokenizer_name = model_name self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(self.device).eval() - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True, use_fast=False) self.args = { "max_seq_length": 384, "doc_stride": 128, @@ -50,7 +49,7 @@ def __init__(self, model_name: str, tokenizer_name: str = None): "max_answer_length": 30, "do_lower_case": True, "output_prediction_file": False, - "output_nbest_file": None, + "output_nbest_file": output_nbest_file, "output_null_log_odds_file": None, "verbose_logging": False, "version_2_with_negative": True, @@ -98,14 +97,14 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) - output = [to_list(output[i]) for output in outputs] + output = [to_list(outputs[oname][i]) for oname in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) - answers, _ = compute_predictions_logits( + answers, nbest = compute_predictions_logits( all_examples=examples, all_features=features, all_results=all_results, @@ -123,10 +122,10 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: all_answers = [] - for idx, ans in enumerate(answers): + for idx, ans in enumerate(nbest):
all_answers.append(Answer( - text=answers[ans][0], - score=answers[ans][1], + text=nbest[ans][0]["text"], + score=nbest[ans][0]["start_logit"] + nbest[ans][0]["end_logit"], ctx_score=contexts[idx].score, language=question.language )) diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index 74a0545..177d90c 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -1,6 +1,7 @@ from typing import List +import json -from pyserini.search import SimpleSearcher, JSimpleSearcherResult +from pyserini.search.lucene import LuceneSearcher, JLuceneSearcherResult from bertserini.utils.utils import init_logger from bertserini.reader.base import Context @@ -8,13 +9,13 @@ def build_searcher(index_path, k1=0.9, b=0.4, language="en"): - searcher = SimpleSearcher(index_path) + searcher = LuceneSearcher(index_path) searcher.set_bm25(k1, b) searcher.object.setLanguage(language) return searcher def build_searcher_from_prebuilt_index(index_name, k1=0.9, b=0.4, language="en"): - searcher = SimpleSearcher.from_prebuilt_index(index_name) + searcher = LuceneSearcher.from_prebuilt_index(index_name) searcher.set_bm25(k1, b) searcher.object.setLanguage(language) return searcher @@ -32,12 +33,12 @@ def retriever(question, searcher, para_num=20): return hits_to_contexts(hits, language) -def hits_to_contexts(hits: List[JSimpleSearcherResult], language="en", field='raw', blacklist=[]) -> List[Context]: +def hits_to_contexts(hits: List[JLuceneSearcherResult], language="en", field='raw', blacklist=[]) -> List[Context]: """ Converts hits from Pyserini into a list of texts. Parameters ---------- - hits : List[JSimpleSearcherResult] + hits : List[JLuceneSearcherResult] The hits. field : str Field to use. 
@@ -53,6 +54,10 @@ def hits_to_contexts(hits: List[JSimpleSearcherResult], language="en", field='ra contexts = [] for i in range(0, len(hits)): t = hits[i].raw if field == 'raw' else hits[i].contents + try: # the previous chinese index stores the contents as "raw", while the english index stores the json string. + t = json.loads(t)["contents"] + except: + pass for s in blacklist: if s in t: continue diff --git a/bertserini/utils/utils_squad.py b/bertserini/utils/utils_squad_metrics.py similarity index 69% rename from bertserini/utils/utils_squad.py rename to bertserini/utils/utils_squad_metrics.py index 7c018ab..2039836 100644 --- a/bertserini/utils/utils_squad.py +++ b/bertserini/utils/utils_squad_metrics.py @@ -1,98 +1,40 @@ -""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was -modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to +update `find_best_threshold` scripts for SQuAD V2.0 -In addition to basic functionality, we also compute additional statistics and -plot precision-recall curves if an additional na_prob.json file is provided. -This file is expected to map question ID's to the model's predicted probability -that a question is unanswerable. 
+In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an +additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted +probability that a question is unanswerable. """ import collections import json -import logging import math import re import string -from transformers.tokenization_bert import BasicTokenizer - - -logger = logging.getLogger(__name__) +#from ...models.bert import BasicTokenizer +#from ...utils import logging +#from transformers.models.bert import BasicTokenizer +from transformers import AutoTokenizer +from transformers.utils import logging -def _is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -class SquadExample: - """ - A single training/test example for the Squad dataset, as loaded from disk. - Args: - qas_id: The example's unique identifier - question_text: The question string - context_text: The context string - answer_text: The answer string - start_position_character: The character position of the start of the answer - title: The title of the example - answers: None by default, this is used during evaluation. Holds answers as well as their start positions. - is_impossible: False by default, set to True if the example has no possible answer. - """ - - def __init__( - self, - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - title, - answers=[], - is_impossible=False, - language="en", - ): - self.qas_id = qas_id - self.question_text = question_text - self.context_text = context_text - self.answer_text = answer_text - self.title = title - self.is_impossible = is_impossible - self.answers = answers - - self.start_position, self.end_position = 0, 0 - - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - - # Split on whitespace so that different tokens may be attributed to their original position. 
- if language == "zh": - for tok_id, c in enumerate(self.context_text): - doc_tokens.append(c) - char_to_word_offset.append(len(doc_tokens) - 1) - else: - for c in self.context_text: - if _is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - self.doc_tokens = doc_tokens - self.char_to_word_offset = char_to_word_offset - - # Start and end positions only has a value during evaluation. - if start_position_character is not None and not is_impossible: - self.start_position = char_to_word_offset[start_position_character] - self.end_position = char_to_word_offset[ - min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) - ] +logger = logging.get_logger(__name__) def normalize_answer(s): @@ -157,7 +99,7 @@ def get_raw_scores(examples, preds): gold_answers = [""] if qas_id not in preds: - print("Missing prediction for %s" % qas_id) + print(f"Missing prediction for {qas_id}") continue prediction = preds[qas_id] @@ -201,7 +143,7 @@ def make_eval_dict(exact_scores, f1_scores, qid_list=None): def merge_eval(main_eval, new_eval, prefix): for k in new_eval: - main_eval["%s_%s" % (prefix, k)] = new_eval[k] + main_eval[f"{prefix}_{k}"] = new_eval[k] def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): @@ -313,7 +255,7 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ return evaluation -def get_final_text(pred_text, orig_text, do_lower_case, language="zh", verbose_logging=False): +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False, language="en", tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original @@ -356,7 +298,9 @@ def _strip_spaces(text): # and `pred_text`, and check if they are 
the same length. If they are # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. - tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + #tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False) + if language=="zh": tok_text = "".join(tokenizer.tokenize(orig_text)) else: @@ -365,7 +309,7 @@ def _strip_spaces(text): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'") return orig_text end_position = start_position + len(pred_text) - 1 @@ -374,7 +318,7 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) + logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'") return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -579,9 +523,10 @@ def compute_predictions_logits( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, language, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, language=language) if "##" in final_text or "[UNK]" in final_text: print(final_text, "||", tok_text, "||", orig_text) + if final_text in seen_predictions: continue @@ -616,7 +561,7 @@ def compute_predictions_logits( if entry.text: best_non_null_entry = entry - # probs = _compute_softmax(total_scores) + #probs = _compute_softmax(total_scores) probs = total_scores nbest_json = [] @@ -626,13 +571,18 @@ def compute_predictions_logits( output["probability"] = probs[i] output["start_logit"] = entry.start_logit output["end_logit"] = 
entry.end_logit + try: + output["start_logit"] = output["start_logit"].item() + output["end_logit"] = output["end_logit"].item() + except: + pass nbest_json.append(output) assert len(nbest_json) >= 1, "No valid predictions" if not version_2_with_negative: all_predictions[example.qas_id] = (nbest_json[0]["text"], - nbest_json[0]['probability']) + nbest_json[0]['probability']) else: # predict "" iff the null score - the score of best non-null > threshold score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) @@ -643,7 +593,6 @@ def compute_predictions_logits( all_predictions[example.qas_id] = ( best_non_null_entry.text, best_non_null_entry.start_logit + best_non_null_entry.end_logit) - all_nbest_json[example.qas_id] = nbest_json if output_prediction_file: @@ -660,3 +609,192 @@ def compute_predictions_logits( return all_predictions, all_nbest_json + +def compute_predictions_log_probs( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging, +): + """ + XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of + null if needed. 
+ + Requires utils_squad_evaluate.py + """ + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) + + logger.info(f"Writing predictions to: {output_prediction_file}") + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + + cur_null_score = result.cls_logits + + # if we could have irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + for i in range(start_n_top): + for j in range(end_n_top): + start_log_prob = result.start_logits[i] + start_index = result.start_top_index[i] + + j_index = i * end_n_top + j + + end_log_prob = result.end_logits[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= feature.paragraph_len - 1: + continue + if end_index >= feature.paragraph_len - 1: + continue + + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob, + ) + ) + + prelim_predictions = sorted( + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + # XLNet un-tokenizer + # Let's keep it simple for now and see if we need all this later. + # + # tok_start_to_orig_index = feature.tok_start_to_orig_index + # tok_end_to_orig_index = feature.tok_end_to_orig_index + # start_orig_pos = tok_start_to_orig_index[pred.start_index] + # end_orig_pos = tok_end_to_orig_index[pred.end_index] + # paragraph_text = example.paragraph_text + # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + # Previously used Bert untokenizer + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + if hasattr(tokenizer, "do_lower_case"): + do_lower_case = tokenizer.do_lower_case + else: + do_lower_case = tokenizer.do_lowercase_and_remove_accent + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + + if final_text 
in seen_predictions: + continue + + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1, "No valid predictions" + assert best_non_null_entry is not None, "No valid predictions" + + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions diff --git a/docs/experiments-cmrc.md b/docs/experiments-cmrc.md index c551260..b57a638 100644 --- a/docs/experiments-cmrc.md +++ b/docs/experiments-cmrc.md @@ -7,16 +7,15 @@ We have indexed the 2018 Wikipedia 
Chinese dump. You can download the prepared index here: ``` -wget ftp://72.143.107.253/BERTserini/chinese_wiki_2018_index.zip +wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/lucene-index.zhwiki-20181201-paragraphs.tar.gz ``` -```*index.zip``` contains the indexed latest Wikipedia dump with Anserini. After unzipping these files, put them under the root path of this repo, and then you are ready to go. Take the following folder structure as an example: ``` bertserini +--- indexes -| +--- lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors +| +--- lucene-index.zhwiki-20181201-paragraphs | | +--- ... +--- other files under this repo ``` @@ -46,10 +45,10 @@ mv index.html cmrc2018_dev_squad.json For `rsvp-ai/bertserini-bert-base-cmrc` ``` python -m bertserini.experiments.inference --dataset_path data/cmrc2018_dev_squad.json \ - --index_path indexes/lucene-index.wiki_zh_paragraph_with_title_0.6.0.pos+docvectors \ + --index_path indexes/lucene-index.zhwiki-20181201-paragraphs \ --model_name_or_path rsvp-ai/bertserini-bert-base-cmrc \ - --output prediction/cmrc2018_pred.json - --topk 10 + --output prediction/cmrc2018_pred.json \ + --topk 10 \ --language zh ``` @@ -70,4 +69,5 @@ Expected results: ``` ## rsvp-ai/bertserini-bert-base-cmrc, this is bert-base-chinese finetuned on the chinese reading comprehension dataset(CMRC) (0.5, {'f1_score': 68.0033167812909, 'exact_match': 51.164958061509786, 'total_count': 3219, 'skip_count': 1}) +replicated Mar 10 2022 by @amyxie361 (0.5, {'f1_score': 65.64519666259483, 'exact_match': 49.98446722584654, 'total_count': 3219, 'skip_count': 0}) ``` diff --git a/test.py b/test.py new file mode 100644 index 0000000..c2c3c35 --- /dev/null +++ b/test.py @@ -0,0 +1,26 @@ + +from bertserini.reader.base import Question, Context +from bertserini.reader.bert_reader import BERT +from bertserini.utils.utils_new import get_best_answer + +model_name = "rsvp-ai/bertserini-bert-base-squad" +tokenizer_name = 
"rsvp-ai/bertserini-bert-base-squad" +bert_reader = BERT(model_name, tokenizer_name) + +question = Question("Why did Mark Twain call the 19th century the glied age?") + +contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] + +candidates = bert_reader.predict(question, contexts) +answer = get_best_answer(candidates, 0.45) +print(answer.text) +print("local context passed") + +from bertserini.retriever.pyserini_retriever import retriever, build_searcher +searcher = build_searcher("indexes/lucene-index.enwiki-20180701") +contexts = retriever(question, searcher, 10) +candidates = bert_reader.predict(question, contexts) +answer = get_best_answer(candidates, 0.45) +print(answer.text) +print("e2e context passed") + From d84e462aae070ad6c7c54939018c550732d2728e Mon Sep 17 00:00:00 2001 From: Ikram Ali Date: Sun, 20 Mar 2022 08:34:54 +0500 Subject: [PATCH 35/50] bug fixed, (#25) bug fixed, --- bertserini/reader/bert_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 8641c51..82acd60 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -97,7 +97,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) - output = [outputs[oname][i]) for oname in outputs] + output = [outputs[oname][i] for oname in outputs] start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) From 5ef06451e7dd5eb56a8cbcc1c4a728ded2971d5e Mon Sep 17 00:00:00 2001 From: Yuqing Xie Date: Sun, 20 Mar 2022 22:54:40 -0400 Subject: [PATCH 36/50] Add dpr retriever (#26) * support dpr retriever * move retriever arguments to args.py * add more test cases * add DPR explain in README.md --- README.md 
| 29 +++++++++ bertserini/experiments/args.py | 55 ++++++++++++++-- bertserini/experiments/evaluate.py | 9 +-- bertserini/experiments/inference.py | 5 +- bertserini/reader/base.py | 2 + bertserini/reader/bert_reader.py | 24 ++++--- bertserini/retriever/pyserini_retriever.py | 66 +++++++++++++------ test.py | 75 +++++++++++++++++----- 8 files changed, 206 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index df17789..e1d82ab 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,35 @@ We have evaluated our system on `SQuAD 1.1` and `CMRC2018` development set. Please see following documents for details: - [SQuAD experiments](docs/experiments-squad.md) - [CMRC experiments](docs/experiments-cmrc.md) + +## DPR supporting + +We enabled DPR retriever with pyserini indexed corpus. +The corpus is created from the command: +``` +python -m pyserini.encode \ + input --corpus \ + --delimiter "DoNotApplyDelimiterPlease" \ + --shard-id 0 \ + --shard-num 1 \ + output --embeddings dpr-ctx_encoder-multiset-base.corpus \ + --to-faiss \ + encoder --encoder facebook/dpr-ctx_encoder-multiset-base \ + --batch-size 16 \ + --device cuda:0 \ + --fp16 # if inference with autocast() +``` + +When enable dpr option in e2e inference, please set the following arguments: + +``` +--retriever dpr \ +--encoder \ +--index_path \ +--sparse_index \ # the dense index doesn't store the raw text, we need to get the original text from the sparse index +--device cuda:0 +``` + ## Citation Please cite [the NAACL 2019 paper]((https://www.aclweb.org/anthology/N19-4013/)): diff --git a/bertserini/experiments/args.py b/bertserini/experiments/args.py index abc389f..828f773 100644 --- a/bertserini/experiments/args.py +++ b/bertserini/experiments/args.py @@ -2,26 +2,64 @@ parser = argparse.ArgumentParser() +parser.add_argument( + "--device", + default="cpu", + type=str, + help="Device to run query encoder, cpu or [cuda:0, cuda:1, ...]", +) parser.add_argument( "--dataset_path", default=None, type=str, 
- required=True, help="Path to the [dev, test] dataset", ) - +parser.add_argument( + "--retriever", + default="bm25", + type=str, + help="define the indexer type", +) +parser.add_argument( + "--k1", + default=0.9, + type=float, + help="k1, parameter for bm25 retriever", +) +parser.add_argument( + "--b", + default=0.4, + type=float, + help="b, parameter for bm25 retriever", +) +parser.add_argument( + "--encoder", + default="facebook/dpr-question_encoder-multiset-base", + type=str, + help="dpr encoder path or name", +) +parser.add_argument( + "--query_tokenizer_name", + default=None, + type=str, + help="tokenizer for dpr encoder", +) parser.add_argument( "--index_path", default=None, type=str, - required=True, help="Path to the indexes of contexts", ) +parser.add_argument( + "--sparse_index", + default=None, + type=str, + help="Path to the indexes of sarse tokenizer, required when using dense index, in order to retrieve the raw document", +) parser.add_argument( "--model_name_or_path", default=None, type=str, - required=True, help="Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( @@ -34,12 +72,11 @@ "--output", default=None, type=str, - required=True, help="The output file where the runs results will be written to", ) parser.add_argument( "--output_nbest_file", - default="./tmp.nbest", + default=None, type=str, help="The output file for store nbest results temporarily", ) @@ -49,6 +86,12 @@ type=str, help="The language of task", ) +parser.add_argument( + "--eval_batch_size", + default=32, + type=int, + help="batch size for evaluation", +) parser.add_argument( "--topk", default=10, diff --git a/bertserini/experiments/evaluate.py b/bertserini/experiments/evaluate.py index 9021302..848edea 100755 --- a/bertserini/experiments/evaluate.py +++ b/bertserini/experiments/evaluate.py @@ -55,16 +55,17 @@ def get_score_with_results(eval_data, predictions, mu, dataset): return eval_result, answers -def 
get_best_mu_with_scores(eval_data, predictions, mu_range, dataset, output_path): +def get_best_mu_with_scores(eval_data, predictions, mu_range, dataset, output_path, standard="f1"): + # standard = "f1" or "exact_match" score_test = {} best_mu = 0 - best_em = 0 + best_score = 0 for mu in mu_range: eval_result, answers = get_score_with_results(eval_data, predictions, mu, dataset) score_test[mu] = eval_result - if eval_result["exact_match"] > best_em: + if eval_result[standard] > best_score: best_mu = mu - best_em = eval_result['exact_match'] + best_score = eval_result[standard] json.dump(answers, open(output_path + "/prediction.json", 'w')) json.dump(score_test, open(output_path + "/score.json", 'w')) diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index de6d042..a64e5af 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -7,8 +7,9 @@ if __name__ == "__main__": questions = extract_squad_questions(args.dataset_path) - bert_reader = BERT(args.model_name_or_path, args.tokenizer_name) - searcher = build_searcher(args.index_path, language=args.language) + #bert_reader = BERT(args.model_name_or_path, args.tokenizer_name) + bert_reader = BERT(args) + searcher = build_searcher(args) all_answer = [] for question in tqdm(questions): diff --git a/bertserini/reader/base.py b/bertserini/reader/base.py index 85dd442..83dcf1e 100644 --- a/bertserini/reader/base.py +++ b/bertserini/reader/base.py @@ -49,10 +49,12 @@ class Context: def __init__(self, text: str, + title: Optional[str] = "", language: str = "en", metadata: Mapping[str, Any] = None, score: Optional[float] = 0): self.text = text + self.title = title self.language = language if metadata is None: metadata = dict() diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 82acd60..e0f132f 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -26,19 +26,19 @@ def 
craft_squad_examples(question: Question, contexts: List[Context]) -> List[Sq title="", is_impossible=False, answers=[], - language=ctx.language ) ) return examples class BERT(Reader): - def __init__(self, model_name: str, tokenizer_name: str = None, output_nbest_file=None): - if tokenizer_name is None: - tokenizer_name = model_name + def __init__(self, args): + self.model_args = args + if self.model_args.tokenizer_name is None: + self.model_args.tokenizer_name = self.model_args.model_name_or_path self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - self.model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(self.device).eval() - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True, use_fast=False) + self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True, use_fast=False) self.args = { "max_seq_length": 384, "doc_stride": 128, @@ -49,7 +49,7 @@ def __init__(self, model_name: str, tokenizer_name: str = None, output_nbest_fil "max_answer_length": 30, "do_lower_case": True, "output_prediction_file": False, - "output_nbest_file": output_nbest_file, + "output_nbest_file": self.model_args.output_nbest_file, "output_null_log_odds_file": None, "verbose_logging": False, "version_2_with_negative": True, @@ -77,7 +77,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=32) + eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.model_args.eval_batch_size) all_results = [] @@ -98,8 +98,14 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: unique_id = int(eval_feature.unique_id) output = [outputs[oname][i] for 
oname in outputs] + start_logits = outputs.start_logits[i] + end_logits = outputs.end_logits[i] + try: + start_logits = start_logits.item() + end_logits = end_logits.item() + except: + pass - start_logits, end_logits = output result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index 177d90c..16aefea 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -1,6 +1,7 @@ from typing import List import json +from pyserini.search import FaissSearcher, DprQueryEncoder from pyserini.search.lucene import LuceneSearcher, JLuceneSearcherResult from bertserini.utils.utils import init_logger from bertserini.reader.base import Context @@ -8,28 +9,50 @@ logger = init_logger("retriever") -def build_searcher(index_path, k1=0.9, b=0.4, language="en"): - searcher = LuceneSearcher(index_path) - searcher.set_bm25(k1, b) - searcher.object.setLanguage(language) +def build_searcher(args): + if args.retriever == "bm25": + searcher = LuceneSearcher(args.index_path) + searcher.set_bm25(args.k1, args.b) + searcher.object.setLanguage(args.language) + elif args.retriever == "dpr": + query_encoder = DprQueryEncoder( + encoder_dir=args.encoder, + tokenizer_name=args.query_tokenizer_name, + device=args.device) + searcher = FaissSearcher(args.index_path, query_encoder) + ssearcher = LuceneSearcher(args.sparse_index) + searcher.ssearcher = ssearcher + else: + raise Exception("Non-Defined Retriever:", args.retriever) return searcher -def build_searcher_from_prebuilt_index(index_name, k1=0.9, b=0.4, language="en"): - searcher = LuceneSearcher.from_prebuilt_index(index_name) - searcher.set_bm25(k1, b) - searcher.object.setLanguage(language) +def build_searcher_from_prebuilt_index(args): + if args.retriever == "bm25": + searcher = LuceneSearcher.from_prebuilt_index(args.index_path) + searcher.set_bm25(args.k1, args.b) + 
searcher.object.setLanguage(args.language) + else: + raise Exception("Not implemented regriever from prebuilt index:", args.retirever) return searcher def retriever(question, searcher, para_num=20): language = question.language - try: - if language == "zh": - hits = searcher.search(question.text.encode("utf-8"), k=para_num) - else: - hits = searcher.search(question.text, k=para_num) - except ValueError as e: - logger.error("Search failure: {}, {}".format(question.text, e)) - return [] + if type(searcher) == FaissSearcher: + results = searcher.search(question.text, para_num) + hits = [] + for r in results: + hit = searcher.doc(r.docid).get("raw") + hits.append((hit, r.score)) + else: + try: + if language == "zh": + hits = searcher.search(question.text.encode("utf-8"), k=para_num) + else: + hits = searcher.search(question.text, k=para_num) + except ValueError as e: + logger.error("Search failure: {}, {}".format(question.text, e)) + return [] + hits = [(h.raw, h.score) for h in hits] return hits_to_contexts(hits, language) @@ -53,14 +76,15 @@ def hits_to_contexts(hits: List[JLuceneSearcherResult], language="en", field='ra """ contexts = [] for i in range(0, len(hits)): - t = hits[i].raw if field == 'raw' else hits[i].contents + hit, score = hits[i] try: # the previous chinese index stores the contents as "raw", while the english index stores the json string. 
- t = json.loads(t)["contents"] + t = json.loads(hit)["contents"] except: - pass + t = hit for s in blacklist: if s in t: continue - metadata = {'raw': hits[i].raw, 'docid': hits[i].docid} - contexts.append(Context(t, language, metadata, hits[i].score)) + #metadata = {'raw': hits.raw, 'docid': hits.docid} + metadata = {} + contexts.append(Context(t, language, metadata, score)) return contexts diff --git a/test.py b/test.py index c2c3c35..ee97059 100644 --- a/test.py +++ b/test.py @@ -1,26 +1,67 @@ - from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT from bertserini.utils.utils_new import get_best_answer +from bertserini.experiments.args import * +from bertserini.retriever.pyserini_retriever import retriever, build_searcher -model_name = "rsvp-ai/bertserini-bert-base-squad" -tokenizer_name = "rsvp-ai/bertserini-bert-base-squad" -bert_reader = BERT(model_name, tokenizer_name) +do_english_test = True +do_local_test = True +do_bm25_test = True +do_dpr_test = True +do_chinese_test = True -question = Question("Why did Mark Twain call the 19th century the glied age?") +if do_english_test: + args.model_name_or_path = "rsvp-ai/bertserini-bert-base-squad" + args.tokenizer_name = "rsvp-ai/bertserini-bert-base-squad" + bert_reader = BERT(args) + print("Question: Why did Mark Twain call the 19th century the glied age?") -contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] +if do_local_test: + print("######################### Testing Local Context #########################") + question = Question("Why did Mark Twain call the 19th century the glied age?") + contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] + candidates = 
bert_reader.predict(question, contexts) + answer = get_best_answer(candidates, 1.0) + print("Answer:", answer.text) + print("Local Context Test Passed") -candidates = bert_reader.predict(question, contexts) -answer = get_best_answer(candidates, 0.45) -print(answer.text) -print("local context passed") +if do_bm25_test: + print("######################### Testing BM25 Context #########################") + args.index_path = "/data/y247xie/01_exps/anserini/lucene-index.ik-nlp-22" + searcher = build_searcher(args) + contexts = retriever(question, searcher, 10) + candidates = bert_reader.predict(question, contexts) + answer = get_best_answer(candidates, 0.45) + print("Answer:", answer.text) + print("BM25 Test Passed") -from bertserini.retriever.pyserini_retriever import retriever, build_searcher -searcher = build_searcher("indexes/lucene-index.enwiki-20180701") -contexts = retriever(question, searcher, 10) -candidates = bert_reader.predict(question, contexts) -answer = get_best_answer(candidates, 0.45) -print(answer.text) -print("e2e context passed") +if do_dpr_test: + print("######################### Testing DPR Context #########################") + args.retriever = "dpr" + args.encoder = "facebook/dpr-question_encoder-multiset-base" + args.query_tokenizer_name = "facebook/dpr-question_encoder-multiset-base" + args.index_path = "/data/y247xie/01_exps/pyserini/dpr-ctx_encoder-multiset-base.ik-nlp-22_slp" + args.device = "cuda:0" + args.sparse_index = "/data/y247xie/01_exps/anserini/lucene-index.ik-nlp-22" + searcher = build_searcher(args) + contexts = retriever(question, searcher, 10) + candidates = bert_reader.predict(question, contexts) + answer = get_best_answer(candidates, 0.45) + print("Answer:", answer.text) + print("DPR Test Passed") +if do_chinese_test: + print("######################### Testing BM25 Chinese #########################") + args.model_name_or_path = "rsvp-ai/bertserini-bert-base-cmrc" + args.tokenizer_name = "rsvp-ai/bertserini-bert-base-cmrc" + 
bert_reader = BERT(args) + args.index_path = "./indexes/lucene-index.zhwiki-20181201-paragraphs" + args.language = "zh" + args.retriever = "bm25" + question = Question("《战国无双3》是由哪两个公司合作开发的?") + searcher = build_searcher(args) + contexts = retriever(question, searcher, 10) + candidates = bert_reader.predict(question, contexts) + answer = get_best_answer(candidates, 0.45) + print("Answer:", answer.text) + print("BM25 Chinese Test Passed") From 58f33eb3dd6c8d5ff9c6b7373216aeb97a77f458 Mon Sep 17 00:00:00 2001 From: Ikram Ali Date: Mon, 21 Mar 2022 08:28:48 +0500 Subject: [PATCH 37/50] Update requirements.txt (#27) * Update requirements.txt * Update transformers version Co-authored-by: Yuqing Xie --- requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 59bd6b6..8f617f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,11 @@ tqdm>=4.45.0 numpy>=1.18.5 -pyserini==0.9.4.0 -transformers==3.4.0 +pyserini==0.16.0 +transformers==4.17.0 torch==1.5.1 torchvision==0.6.1 tensorboardX>=2.1 hanziconv>=0.3.2 nltk +faiss-cpu +zhon From 6cf264d13efb599a4179707679af1b207d322c7e Mon Sep 17 00:00:00 2001 From: Yuqing Xie Date: Mon, 21 Mar 2022 14:45:41 -0400 Subject: [PATCH 38/50] add dpr reader (#28) --- bertserini/reader/dpr_reader.py | 136 ++++++++++++++++++++++++++++++++ test.py | 15 +++- 2 files changed, 147 insertions(+), 4 deletions(-) create mode 100644 bertserini/reader/dpr_reader.py diff --git a/bertserini/reader/dpr_reader.py b/bertserini/reader/dpr_reader.py new file mode 100644 index 0000000..d39f33c --- /dev/null +++ b/bertserini/reader/dpr_reader.py @@ -0,0 +1,136 @@ +from typing import List + +import torch +from torch.utils.data import DataLoader, SequentialSampler + +from transformers import AutoTokenizer, AutoModelForQuestionAnswering, SquadExample +from transformers.data.processors.squad import SquadResult, squad_convert_examples_to_features +from transformers.models.dpr import 
DPRReader, DPRReaderTokenizer, DPRQuestionEncoderTokenizer + +from bertserini.reader.base import Reader, Question, Context, Answer +from bertserini.utils.utils_squad_metrics import compute_predictions_logits +from bertserini.train.run_squad import to_list + +def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: + examples = [] + for idx, ctx in enumerate(contexts): + examples.append( + SquadExample( + qas_id=idx, + question_text=question.text, + context_text=ctx.text, + answer_text=None, + start_position_character=None, + title="", + is_impossible=False, + answers=[], + ) + ) + return examples + + +class DPR(Reader): + def __init__(self, args): + self.model_args = args + if self.model_args.tokenizer_name is None: + self.model_args.tokenizer_name = self.model_args.model_name_or_path + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.model = DPRReader.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() + self.tokenizer = DPRReaderTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True, use_fast=False) + self.args = { + "max_seq_length": 384, + "doc_stride": 128, + "max_query_length": 64, + "threads": 1, + "tqdm_enabled": False, + "n_best_size": 20, + "max_answer_length": 30, + "do_lower_case": True, + "output_prediction_file": False, + "output_nbest_file": self.model_args.output_nbest_file, + "output_null_log_odds_file": None, + "verbose_logging": False, + "version_2_with_negative": True, + "null_score_diff_threshold": 0, + } + + def update_args(self, args_to_change): + for key in args_to_change: + self.args[key] = args_to_change[key] + + def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: + examples = craft_squad_examples(question, contexts) + + features, dataset = squad_convert_examples_to_features( + examples=examples, + tokenizer=self.tokenizer, + max_seq_length=self.args["max_seq_length"], + 
doc_stride=self.args["doc_stride"], + max_query_length=self.args["max_query_length"], + is_training=False, + return_dataset="pt", + threads=self.args["threads"], + tqdm_enabled=self.args["tqdm_enabled"] + ) + + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(dataset) + eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.model_args.eval_batch_size) + + all_results = [] + + for batch in eval_dataloader: + self.model.eval() + batch = tuple(t.to(self.device) for t in batch) + with torch.no_grad(): + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + } + feature_indices = batch[3] + outputs = self.model(**inputs) + + for i, feature_index in enumerate(feature_indices): + eval_feature = features[feature_index.item()] + unique_id = int(eval_feature.unique_id) + + output = [outputs[oname][i] for oname in outputs] + start_logits = outputs.start_logits[i] + end_logits = outputs.end_logits[i] + try: + start_logits = start_logits.item() + end_logits = end_logits.item() + except: + pass + + result = SquadResult(unique_id, start_logits, end_logits) + + all_results.append(result) + + answers, nbest = compute_predictions_logits( + all_examples=examples, + all_features=features, + all_results=all_results, + n_best_size=self.args["n_best_size"], + max_answer_length=self.args["max_answer_length"], + do_lower_case=self.args["do_lower_case"], + output_prediction_file=self.args["output_prediction_file"], + output_nbest_file=self.args["output_nbest_file"], + output_null_log_odds_file=self.args["output_null_log_odds_file"], + verbose_logging=self.args["verbose_logging"], + version_2_with_negative=self.args["version_2_with_negative"], + null_score_diff_threshold=self.args["null_score_diff_threshold"], + tokenizer=self.tokenizer, + language=question.language + ) + + all_answers = [] + for idx, ans in enumerate(nbest): + all_answers.append(Answer( + text=nbest[ans][0]["text"], + score=nbest[ans][0]["start_logit"] + 
nbest[ans][0]["end_logit"], + ctx_score=contexts[idx].score, + language=question.language + )) + return all_answers + diff --git a/test.py b/test.py index ee97059..6098f13 100644 --- a/test.py +++ b/test.py @@ -1,20 +1,27 @@ from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT +from bertserini.reader.dpr_reader import DPR from bertserini.utils.utils_new import get_best_answer from bertserini.experiments.args import * from bertserini.retriever.pyserini_retriever import retriever, build_searcher -do_english_test = True +ENG_reader = "DPR" do_local_test = True do_bm25_test = True do_dpr_test = True -do_chinese_test = True +do_chinese_test = False -if do_english_test: +if ENG_reader == "BERT": args.model_name_or_path = "rsvp-ai/bertserini-bert-base-squad" args.tokenizer_name = "rsvp-ai/bertserini-bert-base-squad" bert_reader = BERT(args) - print("Question: Why did Mark Twain call the 19th century the glied age?") + +elif ENG_reader == "DPR": + args.model_name_or_path = "facebook/dpr-reader-multiset-base" + args.tokenizer_name = "facebook/dpr-reader-multiset-base" + bert_reader = DPR(args) + +print("Question: Why did Mark Twain call the 19th century the glied age?") if do_local_test: print("######################### Testing Local Context #########################") From 50758ec702460098a5a45f5809a74ff8023788e6 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Tue, 22 Mar 2022 01:55:38 +0000 Subject: [PATCH 39/50] clean up code --- .gitignore | 1 - bertserini/experiments/args.py | 1 - bertserini/experiments/eval/evaluate_v1.py | 7 +++---- bertserini/reader/bert_reader.py | 6 ++---- bertserini/utils/utils_squad_metrics.py | 1 - 5 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 971a53d..1c74ceb 100644 --- a/.gitignore +++ b/.gitignore @@ -9,5 +9,4 @@ build/ dist/ bertserini.egg-info/ - indexes/ diff --git a/bertserini/experiments/args.py b/bertserini/experiments/args.py index 
a119e12..828f773 100644 --- a/bertserini/experiments/args.py +++ b/bertserini/experiments/args.py @@ -76,7 +76,6 @@ ) parser.add_argument( "--output_nbest_file", - default="./tmp.nbest", default=None, type=str, help="The output file for store nbest results temporarily", diff --git a/bertserini/experiments/eval/evaluate_v1.py b/bertserini/experiments/eval/evaluate_v1.py index e9ef60a..8c32312 100755 --- a/bertserini/experiments/eval/evaluate_v1.py +++ b/bertserini/experiments/eval/evaluate_v1.py @@ -136,14 +136,13 @@ def squad_v1_eval(dataset_filename, prediction_filename): expected_version = '1.1' with open(dataset_filename) as dataset_file: dataset_json = json.load(dataset_file) - #if dataset_json['version'] != expected_version: - # logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( - # expected_version, dataset_json['version'])) + if dataset_json['version'] != expected_version: + logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( + expected_version, dataset_json['version'])) dataset = dataset_json['data'] with open(prediction_filename) as prediction_file: predictions = json.load(prediction_file) ans = evaluate(dataset, predictions) - # print(json.dumps(ans)) return ans diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 053496a..43224d4 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -1,5 +1,4 @@ from typing import List -import json from transformers import AutoTokenizer, AutoModelForQuestionAnswering, SquadExample from torch.utils.data import DataLoader, SequentialSampler @@ -26,7 +25,6 @@ def craft_squad_examples(question: Question, contexts: List[Context]) -> List[Sq title="", is_impossible=False, answers=[], - #language=ctx.language ) ) return examples @@ -99,7 +97,6 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: unique_id = int(eval_feature.unique_id) output = [outputs[oname][i] for oname in outputs] - 
start_logits = outputs.start_logits[i] end_logits = outputs.end_logits[i] try: @@ -107,6 +104,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: end_logits = end_logits.item() except: pass + result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) @@ -125,8 +123,8 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: version_2_with_negative=self.args["version_2_with_negative"], null_score_diff_threshold=self.args["null_score_diff_threshold"], tokenizer=self.tokenizer, + language=questions.language ) - #nbest = json.load(open(self.args["output_nbest_file"],'r')) all_answers = [] for idx, ans in enumerate(nbest): diff --git a/bertserini/utils/utils_squad_metrics.py b/bertserini/utils/utils_squad_metrics.py index b2001da..e3b5716 100644 --- a/bertserini/utils/utils_squad_metrics.py +++ b/bertserini/utils/utils_squad_metrics.py @@ -33,7 +33,6 @@ logger = logging.get_logger(__name__) - def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" From f11f6382788a312fe0dee7f0f13db0bf108e9f36 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Tue, 22 Mar 2022 01:58:25 +0000 Subject: [PATCH 40/50] fix typo and remove un-used file --- bertserini/reader/bert_reader.py | 2 +- bertserini/utils/utils_squad_new.py | 777 ---------------------------- 2 files changed, 1 insertion(+), 778 deletions(-) delete mode 100644 bertserini/utils/utils_squad_new.py diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 43224d4..3e04c9c 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -123,7 +123,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: version_2_with_negative=self.args["version_2_with_negative"], null_score_diff_threshold=self.args["null_score_diff_threshold"], tokenizer=self.tokenizer, - language=questions.language + language=question.language ) all_answers 
= [] diff --git a/bertserini/utils/utils_squad_new.py b/bertserini/utils/utils_squad_new.py deleted file mode 100644 index 19346fa..0000000 --- a/bertserini/utils/utils_squad_new.py +++ /dev/null @@ -1,777 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to -update `find_best_threshold` scripts for SQuAD V2.0 - -In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an -additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted -probability that a question is unanswerable. 
-""" - - -import collections -import json -import math -import re -import string - -#from ...models.bert import BasicTokenizer -#from ...utils import logging -from transformers import AutoTokenizer - - -#logger = logging.get_logger(__name__) - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) - return re.sub(regex, " ", text) - - def white_space_fix(text): - return " ".join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return "".join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def get_tokens(s): - if not s: - return [] - return normalize_answer(s).split() - - -def compute_exact(a_gold, a_pred): - return int(normalize_answer(a_gold) == normalize_answer(a_pred)) - - -def compute_f1(a_gold, a_pred): - gold_toks = get_tokens(a_gold) - pred_toks = get_tokens(a_pred) - common = collections.Counter(gold_toks) & collections.Counter(pred_toks) - num_same = sum(common.values()) - if len(gold_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - -def get_raw_scores(examples, preds): - """ - Computes the exact and f1 scores from the examples and the model predictions - """ - exact_scores = {} - f1_scores = {} - - for example in examples: - qas_id = example.qas_id - gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] - - if not gold_answers: - # For unanswerable questions, only correct answer is empty string - gold_answers = [""] - - if qas_id not in preds: - print(f"Missing prediction 
for {qas_id}") - continue - - prediction = preds[qas_id] - exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) - f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) - - return exact_scores, f1_scores - - -def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): - new_scores = {} - for qid, s in scores.items(): - pred_na = na_probs[qid] > na_prob_thresh - if pred_na: - new_scores[qid] = float(not qid_to_has_ans[qid]) - else: - new_scores[qid] = s - return new_scores - - -def make_eval_dict(exact_scores, f1_scores, qid_list=None): - if not qid_list: - total = len(exact_scores) - return collections.OrderedDict( - [ - ("exact", 100.0 * sum(exact_scores.values()) / total), - ("f1", 100.0 * sum(f1_scores.values()) / total), - ("total", total), - ] - ) - else: - total = len(qid_list) - return collections.OrderedDict( - [ - ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), - ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), - ("total", total), - ] - ) - - -def merge_eval(main_eval, new_eval, prefix): - for k in new_eval: - main_eval[f"{prefix}_{k}"] = new_eval[k] - - -def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): - if qid not in scores: - continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - - has_ans_score, has_ans_cnt = 0, 0 - for qid in qid_list: - if not qid_to_has_ans[qid]: - continue - has_ans_cnt += 1 - - if qid not in scores: - continue - has_ans_score += scores[qid] - - return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt - 
- -def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval["best_exact"] = best_exact - main_eval["best_exact_thresh"] = exact_thresh - main_eval["best_f1"] = best_f1 - main_eval["best_f1_thresh"] = f1_thresh - main_eval["has_ans_exact"] = has_ans_exact - main_eval["has_ans_f1"] = has_ans_f1 - - -def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for _, qid in enumerate(qid_list): - if qid not in scores: - continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - return 100.0 * best_score / len(scores), best_thresh - - -def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) - - main_eval["best_exact"] = best_exact - main_eval["best_exact_thresh"] = exact_thresh - main_eval["best_f1"] = best_f1 - main_eval["best_f1_thresh"] = f1_thresh - - -def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): - qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} - has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] - no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] - - if no_answer_probs is None: - 
no_answer_probs = {k: 0.0 for k in preds} - - exact, f1 = get_raw_scores(examples, preds) - - exact_threshold = apply_no_ans_threshold( - exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold - ) - f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) - - evaluation = make_eval_dict(exact_threshold, f1_threshold) - - if has_answer_qids: - has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) - merge_eval(evaluation, has_ans_eval, "HasAns") - - if no_answer_qids: - no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) - merge_eval(evaluation, no_ans_eval, "NoAns") - - if no_answer_probs: - find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) - - return evaluation - - -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" - - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. 
This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. - #tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="rsvp-ai/bertserini-bert-base-squad", use_fast=False) - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - #if verbose_logging: - #logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'") - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - #if verbose_logging: - #logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'") - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. 
- tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - #if verbose_logging: - # logger.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - #if verbose_logging: - # logger.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position : (orig_end_position + 1)] - return output_text - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -def compute_predictions_logits( - all_examples, - all_features, - all_results, - n_best_size, - max_answer_length, - do_lower_case, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - verbose_logging, - version_2_with_negative, - null_score_diff_threshold, - tokenizer, -): - """Write final 
predictions to the json file and log-odds of null if needed.""" - #if output_prediction_file: - # logger.info(f"Writing predictions to: {output_prediction_file}") - #if output_nbest_file: - # logger.info(f"Writing nbest to: {output_nbest_file}") - #if output_null_log_odds_file and version_2_with_negative: - # logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}") - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] - ) - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min null score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - # if we could have irrelevant answers, get the min score of irrelevant - if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = feature_index - null_start_logit = 
result.start_logits[0] - null_end_logit = result.end_logits[0] - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index], - ) - ) - if version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=min_null_feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit, - ) - ) - prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"] - ) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] - - tok_text = tokenizer.convert_tokens_to_string(tok_tokens) - - # tok_text = " ".join(tok_tokens) - # - # 
# De-tokenize WordPieces that have been split off. - # tok_text = tok_text.replace(" ##", "") - # tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) - # if we didn't include the empty option in the n-best, include it - if version_2_with_negative: - if "" not in seen_predictions: - nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) - - # In very rare edge cases we could only have single null prediction. - # So we just create a nonce prediction in this case to avoid failure. - if len(nbest) == 1: - nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1, "No valid predictions" - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1, "No valid predictions" - - if not version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) - scores_diff_json[example.qas_id] = score_diff - if score_diff > null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text - all_nbest_json[example.qas_id] = nbest_json - - if output_prediction_file: - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - if output_nbest_file: - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if output_null_log_odds_file and version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_nbest_json - - -def compute_predictions_log_probs( - all_examples, - all_features, - all_results, - n_best_size, - max_answer_length, - output_prediction_file, - output_nbest_file, - output_null_log_odds_file, - start_n_top, - end_n_top, - version_2_with_negative, - tokenizer, - 
verbose_logging, -): - """ - XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of - null if needed. - - Requires utils_squad_evaluate.py - """ - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] - ) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] - ) - - #logger.info(f"Writing predictions to: {output_prediction_file}") - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - - cur_null_score = result.cls_logits - - # if we could have irrelevant answers, get the min score of irrelevant - score_null = min(score_null, cur_null_score) - - for i in range(start_n_top): - for j in range(end_n_top): - start_log_prob = result.start_logits[i] - start_index = result.start_top_index[i] - - j_index = i * end_n_top + j - - end_log_prob = result.end_logits[j_index] - end_index = result.end_top_index[j_index] - - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. 
- if start_index >= feature.paragraph_len - 1: - continue - if end_index >= feature.paragraph_len - 1: - continue - - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_log_prob=start_log_prob, - end_log_prob=end_log_prob, - ) - ) - - prelim_predictions = sorted( - prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True - ) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - - # XLNet un-tokenizer - # Let's keep it simple for now and see if we need all this later. - # - # tok_start_to_orig_index = feature.tok_start_to_orig_index - # tok_end_to_orig_index = feature.tok_end_to_orig_index - # start_orig_pos = tok_start_to_orig_index[pred.start_index] - # end_orig_pos = tok_end_to_orig_index[pred.end_index] - # paragraph_text = example.paragraph_text - # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() - - # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] - tok_text = tokenizer.convert_tokens_to_string(tok_tokens) - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - if hasattr(tokenizer, "do_lower_case"): - do_lower_case = tokenizer.do_lower_case - else: - do_lower_case = tokenizer.do_lowercase_and_remove_accent - - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) - - if final_text 
in seen_predictions: - continue - - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) - ) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. - if not nbest: - nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_log_prob + entry.end_log_prob) - if not best_non_null_entry: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_log_prob"] = entry.start_log_prob - output["end_log_prob"] = entry.end_log_prob - nbest_json.append(output) - - assert len(nbest_json) >= 1, "No valid predictions" - assert best_non_null_entry is not None, "No valid predictions" - - score_diff = score_null - scores_diff_json[example.qas_id] = score_diff - # note(zhiliny): always predict best_non_null_entry - # and the evaluation script will search for the best threshold - all_predictions[example.qas_id] = best_non_null_entry.text - - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions From 71ae035f957254546062d821b7b6b5afbb77fe4d Mon Sep 17 00:00:00 2001 From: Yuqing Xie Date: Mon, 21 Mar 2022 22:00:38 -0400 Subject: [PATCH 41/50] clean up (#29) Clean up code --- .gitignore | 2 ++ 
bertserini/experiments/eval/evaluate_v1.py | 4 +--- bertserini/experiments/inference.py | 1 - bertserini/reader/bert_reader.py | 3 +-- bertserini/retriever/pyserini_retriever.py | 1 - bertserini/utils/utils_squad_metrics.py | 10 ++-------- 6 files changed, 6 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 06d575f..1c74ceb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ cache* *.log *.json +*.sh .idea/ @@ -8,3 +9,4 @@ build/ dist/ bertserini.egg-info/ +indexes/ diff --git a/bertserini/experiments/eval/evaluate_v1.py b/bertserini/experiments/eval/evaluate_v1.py index b853597..8c32312 100755 --- a/bertserini/experiments/eval/evaluate_v1.py +++ b/bertserini/experiments/eval/evaluate_v1.py @@ -98,8 +98,7 @@ def evaluate(dataset, predictions): for qa in paragraph['qas']: total += 1 if qa['id'] not in predictions: - message = 'Unanswered question ' + qa['id'] + \ - ' will receive score 0.' + message = 'Unanswered question ' + str(qa['id']) + ' will receive score 0.' 
logger.error(message) continue ground_truths = list(map(lambda x: x['text'], qa['answers'])) @@ -144,7 +143,6 @@ def squad_v1_eval(dataset_filename, prediction_filename): with open(prediction_filename) as prediction_file: predictions = json.load(prediction_file) ans = evaluate(dataset, predictions) - # print(json.dumps(ans)) return ans diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index a64e5af..a542750 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -7,7 +7,6 @@ if __name__ == "__main__": questions = extract_squad_questions(args.dataset_path) - #bert_reader = BERT(args.model_name_or_path, args.tokenizer_name) bert_reader = BERT(args) searcher = build_searcher(args) diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index e0f132f..3e04c9c 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -12,7 +12,6 @@ from bertserini.train.run_squad import to_list - def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: examples = [] for idx, ctx in enumerate(contexts): @@ -105,7 +104,7 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: end_logits = end_logits.item() except: pass - + result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index 16aefea..f11a573 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -84,7 +84,6 @@ def hits_to_contexts(hits: List[JLuceneSearcherResult], language="en", field='ra for s in blacklist: if s in t: continue - #metadata = {'raw': hits.raw, 'docid': hits.docid} metadata = {} contexts.append(Context(t, language, metadata, score)) return contexts diff --git a/bertserini/utils/utils_squad_metrics.py b/bertserini/utils/utils_squad_metrics.py 
index 2039836..e3b5716 100644 --- a/bertserini/utils/utils_squad_metrics.py +++ b/bertserini/utils/utils_squad_metrics.py @@ -27,16 +27,12 @@ import re import string -#from ...models.bert import BasicTokenizer -#from ...utils import logging -#from transformers.models.bert import BasicTokenizer -from transformers import AutoTokenizer from transformers.utils import logging +from transformers import AutoTokenizer logger = logging.get_logger(__name__) - def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" @@ -298,9 +294,8 @@ def _strip_spaces(text): # and `pred_text`, and check if they are the same length. If they are # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. - #tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False) if language=="zh": tok_text = "".join(tokenizer.tokenize(orig_text)) else: @@ -561,7 +556,6 @@ def compute_predictions_logits( if entry.text: best_non_null_entry = entry - #probs = _compute_softmax(total_scores) probs = total_scores nbest_json = [] From 0b86c86820649c88234b286bc6cd4c9e9f665aaf Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Tue, 22 Mar 2022 03:43:42 +0000 Subject: [PATCH 42/50] fix minor bug --- bertserini/reader/bert_reader.py | 1 - bertserini/retriever/pyserini_retriever.py | 2 +- test.py => inference_test.py | 21 ++++++++++++--------- 3 files changed, 13 insertions(+), 11 deletions(-) rename test.py => inference_test.py (79%) diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 875bf53..b0256ce 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -96,7 +96,6 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: eval_feature = features[feature_index.item()] 
unique_id = int(eval_feature.unique_id) - output = [to_list(output[i]) for output in outputs] output = [outputs[oname][i] for oname in outputs] start_logits = outputs.start_logits[i] end_logits = outputs.end_logits[i] diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index bce4723..fdd283e 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -42,7 +42,7 @@ def retriever(question, searcher, para_num=20): results = searcher.search(question.text, k=para_num) hits = [] for r in results: - hit = searcher.doc(r.docid).get("raw") + hit = searcher.doc(r.docid).get("raw") hits.append((hit, r.score)) else: try: diff --git a/test.py b/inference_test.py similarity index 79% rename from test.py rename to inference_test.py index 6098f13..69bf0f7 100644 --- a/test.py +++ b/inference_test.py @@ -5,11 +5,11 @@ from bertserini.experiments.args import * from bertserini.retriever.pyserini_retriever import retriever, build_searcher -ENG_reader = "DPR" +ENG_reader = "BERT" do_local_test = True do_bm25_test = True do_dpr_test = True -do_chinese_test = False +do_chinese_test = True if ENG_reader == "BERT": args.model_name_or_path = "rsvp-ai/bertserini-bert-base-squad" @@ -21,25 +21,28 @@ args.tokenizer_name = "facebook/dpr-reader-multiset-base" bert_reader = DPR(args) -print("Question: Why did Mark Twain call the 19th century the glied age?") +question = Question("Why did Mark Twain call the 19th century the glied age?") +print(question.text) if do_local_test: print("######################### Testing Local Context #########################") - question = Question("Why did Mark Twain call the 19th century the glied age?") contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] candidates = bert_reader.predict(question, contexts) answer = 
get_best_answer(candidates, 1.0) print("Answer:", answer.text) - print("Local Context Test Passed") + if answer.text == "there had been a dramatic expansion of American wealth and prosperity": + print("Local Context Test Passed") + else: + print("Wrong Answer") if do_bm25_test: print("######################### Testing BM25 Context #########################") - args.index_path = "/data/y247xie/01_exps/anserini/lucene-index.ik-nlp-22" + args.index_path = "./indexes/lucene-index.enwiki-20180701-paragraphs" searcher = build_searcher(args) contexts = retriever(question, searcher, 10) candidates = bert_reader.predict(question, contexts) answer = get_best_answer(candidates, 0.45) - print("Answer:", answer.text) + print("Answer:", answer.text) # todo: no context returned. is the context included? maybe update to another question print("BM25 Test Passed") if do_dpr_test: @@ -47,9 +50,9 @@ args.retriever = "dpr" args.encoder = "facebook/dpr-question_encoder-multiset-base" args.query_tokenizer_name = "facebook/dpr-question_encoder-multiset-base" - args.index_path = "/data/y247xie/01_exps/pyserini/dpr-ctx_encoder-multiset-base.ik-nlp-22_slp" + args.index_path = "../pyserini/dpr-ctx_encoder-multiset-base.ik-nlp-22_slp" # todo: replicate dpr on wiki and release dpr-indexes args.device = "cuda:0" - args.sparse_index = "/data/y247xie/01_exps/anserini/lucene-index.ik-nlp-22" + args.sparse_index = "../anserini/lucene-index.ik-nlp-22" searcher = build_searcher(args) contexts = retriever(question, searcher, 10) candidates = bert_reader.predict(question, contexts) From ff6d802357ccda4d63aa5c4604a1ce8a2be71972 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Wed, 30 Mar 2022 02:22:42 +0000 Subject: [PATCH 43/50] address comments' --- bertserini/experiments/eval/evaluate_v1_cmrc.py | 3 --- bertserini/experiments/evaluate.py | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/bertserini/experiments/eval/evaluate_v1_cmrc.py 
b/bertserini/experiments/eval/evaluate_v1_cmrc.py index 4556d15..0f3ff32 100755 --- a/bertserini/experiments/eval/evaluate_v1_cmrc.py +++ b/bertserini/experiments/eval/evaluate_v1_cmrc.py @@ -17,9 +17,6 @@ #from utils import init_logger #logger = init_logger("evaluation") -#install punkt corpus -nltk.download('punkt') - # split Chinese with English def mixed_segmentation(in_str, rm_punc=False): in_str = str(in_str).lower().strip() diff --git a/bertserini/experiments/evaluate.py b/bertserini/experiments/evaluate.py index 848edea..f79c7a4 100755 --- a/bertserini/experiments/evaluate.py +++ b/bertserini/experiments/evaluate.py @@ -55,17 +55,17 @@ def get_score_with_results(eval_data, predictions, mu, dataset): return eval_result, answers -def get_best_mu_with_scores(eval_data, predictions, mu_range, dataset, output_path, standard="f1"): - # standard = "f1" or "exact_match" +def get_best_mu_with_scores(eval_data, predictions, mu_range, dataset, output_path, metric="f1"): + # metric = "f1" or "exact_match" score_test = {} best_mu = 0 best_score = 0 for mu in mu_range: eval_result, answers = get_score_with_results(eval_data, predictions, mu, dataset) score_test[mu] = eval_result - if eval_result[standard] > best_score: + if eval_result[metric] > best_score: best_mu = mu - best_score = eval_result[standard] + best_score = eval_result[metric] json.dump(answers, open(output_path + "/prediction.json", 'w')) json.dump(score_test, open(output_path + "/score.json", 'w')) From eac213b85ec1ee89f1621f073f3d3beb9bb384b2 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Tue, 23 Aug 2022 16:48:27 +0000 Subject: [PATCH 44/50] update to fast-tokenize --- bertserini/experiments/eval/evaluate_v1.py | 53 ++++++++++++++++++++-- bertserini/experiments/inference.py | 4 ++ bertserini/reader/bert_reader.py | 4 +- bertserini/utils/utils_squad_metrics.py | 12 +++-- inference_test.py | 8 ++-- 5 files changed, 65 insertions(+), 16 deletions(-) diff --git a/bertserini/experiments/eval/evaluate_v1.py 
b/bertserini/experiments/eval/evaluate_v1.py index 8c32312..300cf9f 100755 --- a/bertserini/experiments/eval/evaluate_v1.py +++ b/bertserini/experiments/eval/evaluate_v1.py @@ -4,6 +4,12 @@ import argparse import json +from rouge_metric import PyRouge +rouge = PyRouge(rouge_n=(2,), rouge_su=True, skip_gap=4) +#from rouge_score import rouge_scorer +#rouge1_scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True) +#rougel_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + from bertserini.utils.utils import normalize_answer, init_logger logger = init_logger("evluation") @@ -67,6 +73,32 @@ def overlap_score(prediction, ground_truth): def exact_match_score(prediction, ground_truth): return normalize_answer(prediction) == normalize_answer(ground_truth) +def rouge2_r_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["r"] + #return rouge1_scorer.score(prediction, ground_truth) + +def rouge2_f_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["f"] + +def rougesu4_r_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["r"] + +def rougesu4_f_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["f"] + +#def rougel_score(prediction, ground_truth): +# print(rougel_scorer.score(prediction, ground_truth)) +# input() +# return rougel_scorer.score(prediction, ground_truth) + def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): scores_for_ground_truths = [] @@ -92,7 +124,7 @@ def metric_max_recall(metric_fn, prediction, ground_truths): def evaluate(dataset, predictions): - sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = 0 + sentence_cover = 
precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = rouge2_r = rouge2_f = rougesu4_r = rougesu4_f = 0 for article in dataset: for paragraph in article['paragraphs']: for qa in paragraph['qas']: @@ -104,6 +136,11 @@ def evaluate(dataset, predictions): ground_truths = list(map(lambda x: x['text'], qa['answers'])) prediction = [predictions[qa['id']]] #prediction_sentence = predictions[qa['id']]['sentences'] + rouge2_r += metric_max_recall(rouge2_r_score, prediction, ground_truths) + rouge2_f += metric_max_recall(rouge2_f_score, prediction, ground_truths) + rougesu4_r += metric_max_recall(rougesu4_r_score, prediction, ground_truths) + rougesu4_f += metric_max_recall(rougesu4_f_score, prediction, ground_truths) + #rougel += metric_max_recall(rougel_score, prediction, ground_truths) cover += metric_max_recall(cover_score, prediction, ground_truths) exact_match += metric_max_recall( exact_match_score, prediction, ground_truths) @@ -124,21 +161,27 @@ def evaluate(dataset, predictions): overlap = 100.0 * overlap / total cover = 100.0 * cover / total precision = 100.0 * precision / total + rouge2_r = 100.0 * rouge2_r / total + rouge2_f = 100.0 * rouge2_f / total + rougesu4_r = 100.0 * rougesu4_r / total + rougesu4_f = 100.0 * rougesu4_f / total + #rougel = 100.0 * rougel / total #sentence_recall = 100.0 * sentence_recall / total #sentence_cover = 100.0 * sentence_cover / total return {'exact_match': exact_match, 'f1': f1, "recall": recall, #"sentence_recall": sentence_recall, "sentence_cover": sentence_cover, - "precision": precision, "cover": cover, "overlap": overlap} + "precision": precision, "cover": cover, "overlap": overlap, + "rouge2_r": rouge2_r, "rouge2_f":rouge2_f, "rougesu4_r": rougesu4_r, "rougesu4_f": rougesu4_f} def squad_v1_eval(dataset_filename, prediction_filename): expected_version = '1.1' with open(dataset_filename) as dataset_file: dataset_json = json.load(dataset_file) - if dataset_json['version'] != expected_version: - 
logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( - expected_version, dataset_json['version'])) + #if dataset_json['version'] != expected_version: + # logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( + # expected_version, dataset_json['version'])) dataset = dataset_json['data'] with open(prediction_filename) as prediction_file: predictions = json.load(prediction_file) diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index a204218..d6e7761 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -4,6 +4,7 @@ from bertserini.retriever.pyserini_retriever import retriever, build_searcher from bertserini.utils.utils_new import extract_squad_questions from bertserini.experiments.args import * +import time if __name__ == "__main__": questions = extract_squad_questions(args.dataset_path, do_strip_accents=args.strip_accents) @@ -13,8 +14,11 @@ all_answer = [] for question in tqdm(questions): + print("before retriever:", time.time()) contexts = retriever(question, searcher, args.topk) + print("before reader:", time.time()) final_answers = bert_reader.predict(question, contexts) + print("after reader:", time.time()) final_answers_lst = [] for ans in final_answers: final_answers_lst.append( diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index b0256ce..eb210c3 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -37,7 +37,7 @@ def __init__(self, args): self.model_args.tokenizer_name = self.model_args.model_name_or_path self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() - self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True, use_fast=False) + self.tokenizer = 
AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True) self.args = { "max_seq_length": 384, "doc_stride": 128, @@ -61,7 +61,6 @@ def update_args(self, args_to_change): def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: examples = craft_squad_examples(question, contexts) - features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=self.tokenizer, @@ -79,7 +78,6 @@ def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.model_args.eval_batch_size) all_results = [] - for batch in eval_dataloader: self.model.eval() batch = tuple(t.to(self.device) for t in batch) diff --git a/bertserini/utils/utils_squad_metrics.py b/bertserini/utils/utils_squad_metrics.py index e3b5716..284af9e 100644 --- a/bertserini/utils/utils_squad_metrics.py +++ b/bertserini/utils/utils_squad_metrics.py @@ -251,7 +251,7 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ return evaluation -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False, language="en", tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): +def get_final_text(pred_text, orig_text, do_lower_case, tokenizer, verbose_logging=False, language="en", tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original @@ -295,7 +295,7 @@ def _strip_spaces(text): # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. 
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False) + #tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True) if language=="zh": tok_text = "".join(tokenizer.tokenize(orig_text)) else: @@ -422,6 +422,8 @@ def compute_predictions_logits( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() + import os + os.environ["TOKENIZERS_PARALLELISM"] = "false" for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] @@ -491,7 +493,9 @@ def compute_predictions_logits( seen_predictions = {} nbest = [] + c = 0 for pred in prelim_predictions: + c += 1 if len(nbest) >= n_best_size: break feature = features[pred.feature_index] @@ -518,7 +522,7 @@ def compute_predictions_logits( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, language=language) + final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, verbose_logging=verbose_logging, language=language) if "##" in final_text or "[UNK]" in final_text: print(final_text, "||", tok_text, "||", orig_text) @@ -736,7 +740,7 @@ def compute_predictions_log_probs( else: do_lower_case = tokenizer.do_lowercase_and_remove_accent - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, tokenizer) if final_text in seen_predictions: continue diff --git a/inference_test.py b/inference_test.py index 69bf0f7..a871a11 100644 --- a/inference_test.py +++ b/inference_test.py @@ -6,9 +6,9 @@ from bertserini.retriever.pyserini_retriever import retriever, build_searcher ENG_reader = "BERT" -do_local_test = True -do_bm25_test = True -do_dpr_test = True +do_local_test = False +do_bm25_test = False +do_dpr_test = False do_chinese_test = True if ENG_reader == 
"BERT": @@ -51,7 +51,7 @@ args.encoder = "facebook/dpr-question_encoder-multiset-base" args.query_tokenizer_name = "facebook/dpr-question_encoder-multiset-base" args.index_path = "../pyserini/dpr-ctx_encoder-multiset-base.ik-nlp-22_slp" # todo: replicate dpr on wiki and release dpr-indexes - args.device = "cuda:0" + args.device = "cuda:cpu" args.sparse_index = "../anserini/lucene-index.ik-nlp-22" searcher = build_searcher(args) contexts = retriever(question, searcher, 10) From 6cd951840d5ed82bf14fa2dc8f40719b0629e702 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Wed, 24 Aug 2022 20:02:47 +0000 Subject: [PATCH 45/50] runable fast-tokenizer, need to fix accuracy issues --- bertserini/reader/bert_reader.py | 230 +++++++++++++++------ bertserini/retriever/pyserini_retriever.py | 3 +- inference_test.py | 19 +- 3 files changed, 174 insertions(+), 78 deletions(-) diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index eb210c3..01c02c6 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -2,16 +2,16 @@ import torch from torch.utils.data import DataLoader, SequentialSampler -from transformers import AutoTokenizer, AutoModelForQuestionAnswering, SquadExample, squad_convert_examples_to_features -from transformers.data.processors.squad import SquadResult +from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, EvalPrediction +from datasets import Dataset +import numpy as np +from bertserini.utils.squad import SquadExample +from bertserini.utils.utils_qa import postprocess_qa_predictions from bertserini.reader.base import Reader, Question, Context, Answer -from bertserini.utils.utils_squad_metrics import compute_predictions_logits __all__ = ['BERT'] -from bertserini.train.run_squad import to_list - def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: examples = [] for idx, ctx in enumerate(contexts): @@ -45,7 +45,7 @@ def 
__init__(self, args): "threads": 1, "tqdm_enabled": False, "n_best_size": 20, - "max_answer_length": 30, + "max_answer_length": 384, "do_lower_case": True, "output_prediction_file": False, "output_nbest_file": self.model_args.output_nbest_file, @@ -59,76 +59,172 @@ def update_args(self, args_to_change): for key in args_to_change: self.args[key] = args_to_change[key] + + def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: - examples = craft_squad_examples(question, contexts) - features, dataset = squad_convert_examples_to_features( - examples=examples, - tokenizer=self.tokenizer, - max_seq_length=self.args["max_seq_length"], - doc_stride=self.args["doc_stride"], - max_query_length=self.args["max_query_length"], - is_training=False, - return_dataset="pt", - threads=self.args["threads"], - tqdm_enabled=self.args["tqdm_enabled"] + + def prepare_validation_features(examples): + question_column_name = "question" + context_column_name = "context" + # answer_column_name = "answers" if "answers" in column_names else column_names[2] + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = self.tokenizer( + examples[question_column_name if self.args["pad_on_right"] else context_column_name], + examples[context_column_name if self.args["pad_on_right"] else question_column_name], + truncation="only_second" if self.args["pad_on_right"] else "only_first", + max_length=self.args["max_seq_length"], + stride=self.args["doc_stride"], + return_overflowing_tokens=True, + return_offsets_mapping=True, + verbose=False, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if self.args["pad_on_right"] else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + # print(tokenized_examples) + return tokenized_examples + + def create_and_fill_np_array(start_or_end_logits, dataset, max_len): + """ + Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor + + Args: + start_or_end_logits(:obj:`tensor`): + This is the output predictions of the model. We can only enter either start or end logits. + eval_dataset: Evaluation dataset + max_len(:obj:`int`): + The maximum length of the output tensor. ( See the model.eval() part for more details ) + """ + + step = 0 + # create a numpy array and fill it with -100. + logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64) + # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather + for i, output_logit in enumerate(start_or_end_logits): # populate columns + # We have to fill it such that we have to take the whole tensor and replace it on the newly created array + # And after every iteration we have to change the step + + batch_size = output_logit.shape[0] + cols = output_logit.shape[1] + + if step + batch_size < len(dataset): + logits_concat[step: step + batch_size, :cols] = output_logit + else: + logits_concat[step:, :cols] = output_logit[: len(dataset) - step] + + step += batch_size + + return logits_concat + + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. 
+ _, all_nbest_json = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=self.args["version_2_with_negative"], + n_best_size=self.args["n_best_size"], + max_answer_length=self.args["max_answer_length"], + null_score_diff_threshold=self.args["null_score_diff_threshold"], + output_dir="./tmp/", + # output_dir=self.args["output_dir"], + # log_level=log_level, + prefix=stage, + ) + # print(predictions) + # print(all_nbest_json) + # Format the result to the format the metric expects. + # if self.args["version_2_with_negative"]: + # formatted_predictions = [ + # {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + # ] + # else: + # formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + # references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples] + # return EvalPrediction(predictions=formatted_predictions)#, label_ids=references) + return all_nbest_json + + inputs = {"question": [], "context": [], "id": []} + for i, ctx in enumerate(contexts): + inputs["question"].append(question.text) + inputs["context"].append(contexts[i].text) + inputs["id"].append(i) + eval_examples = Dataset.from_dict(inputs) + column_names = eval_examples.column_names + + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=1, + remove_columns=column_names, ) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.model_args.eval_batch_size) + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) - all_results = [] + eval_dataloader = DataLoader( + eval_dataset_for_model, + collate_fn=default_data_collator, + batch_size=self.model_args.eval_batch_size, + ) + self.model.eval() + all_start_logits = [] + all_end_logits = [] for batch in 
eval_dataloader: - self.model.eval() - batch = tuple(t.to(self.device) for t in batch) + for k in batch: + batch[k] = batch[k].to(self.device) with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "token_type_ids": batch[2], - } - feature_indices = batch[3] - outputs = self.model(**inputs) - - for i, feature_index in enumerate(feature_indices): - eval_feature = features[feature_index.item()] - unique_id = int(eval_feature.unique_id) - - output = [outputs[oname][i] for oname in outputs] - start_logits = outputs.start_logits[i] - end_logits = outputs.end_logits[i] - try: - start_logits = start_logits.item() - end_logits = end_logits.item() - except: - pass - - result = SquadResult(unique_id, start_logits, end_logits) - all_results.append(result) - - answers, nbest = compute_predictions_logits( - all_examples=examples, - all_features=features, - all_results=all_results, - n_best_size=self.args["n_best_size"], - max_answer_length=self.args["max_answer_length"], - do_lower_case=self.args["do_lower_case"], - output_prediction_file=self.args["output_prediction_file"], - output_nbest_file=self.args["output_nbest_file"], - output_null_log_odds_file=self.args["output_null_log_odds_file"], - verbose_logging=self.args["verbose_logging"], - version_2_with_negative=self.args["version_2_with_negative"], - null_score_diff_threshold=self.args["null_score_diff_threshold"], - tokenizer=self.tokenizer, - language=question.language - ) + outputs = self.model(**batch) + start_logits = outputs.start_logits + end_logits = outputs.end_logits + all_start_logits.append(start_logits.cpu().numpy()) + all_end_logits.append(end_logits.cpu().numpy()) - all_answers = [] + start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, self.args["max_answer_length"]) + end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, self.args["max_answer_length"]) + + del all_start_logits + del all_end_logits - for idx, ans in 
enumerate(nbest): + outputs_numpy = (start_logits_concat, end_logits_concat) + + all_nbest_json = post_processing_function(eval_examples, eval_dataset, outputs_numpy) + + all_answers = [] + for idx, ans in enumerate(all_nbest_json): all_answers.append(Answer( - text=nbest[ans][0]["text"], - score=nbest[ans][0]["start_logit"] + nbest[ans][0]["end_logit"], + text=all_nbest_json[ans][0]["text"], + score=all_nbest_json[ans][0]["probability"], + # score=all_nbest_json[ans][0]["start_logit"] + all_nbest_json[ans][0]["end_logit"], ctx_score=contexts[idx].score, language=question.language )) diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index fdd283e..ad17c27 100644 --- a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -86,5 +86,6 @@ def hits_to_contexts(hits: List[JLuceneSearcherResult], language="en", field='ra if s in t: continue metadata = {} - contexts.append(Context(t, language, metadata, score)) + + contexts.append(Context(hit, language=language, metadata=metadata, score=score)) return contexts diff --git a/inference_test.py b/inference_test.py index a871a11..182dd69 100644 --- a/inference_test.py +++ b/inference_test.py @@ -6,10 +6,10 @@ from bertserini.retriever.pyserini_retriever import retriever, build_searcher ENG_reader = "BERT" -do_local_test = False -do_bm25_test = False +do_local_test = True +do_bm25_test = True do_dpr_test = False -do_chinese_test = True +do_chinese_test = False if ENG_reader == "BERT": args.model_name_or_path = "rsvp-ai/bertserini-bert-base-squad" @@ -21,19 +21,18 @@ args.tokenizer_name = "facebook/dpr-reader-multiset-base" bert_reader = DPR(args) -question = Question("Why did Mark Twain call the 19th century the glied age?") +# question = Question("Why did Mark Twain call the 19th century the glied age?") +question = Question("Where is the capital of China?") + print(question.text) if do_local_test: print("######################### 
Testing Local Context #########################") - contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] + contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.'), + Context('The "Gilded Age"')] candidates = bert_reader.predict(question, contexts) answer = get_best_answer(candidates, 1.0) print("Answer:", answer.text) - if answer.text == "there had been a dramatic expansion of American wealth and prosperity": - print("Local Context Test Passed") - else: - print("Wrong Answer") if do_bm25_test: print("######################### Testing BM25 Context #########################") @@ -43,7 +42,7 @@ candidates = bert_reader.predict(question, contexts) answer = get_best_answer(candidates, 0.45) print("Answer:", answer.text) # todo: no context returned. is the context included? maybe update to another question - print("BM25 Test Passed") + # print("BM25 Test Passed") if do_dpr_test: print("######################### Testing DPR Context #########################") From 0ba273f1b53b1377872499347c5d4559576a3e91 Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Wed, 24 Aug 2022 20:04:55 +0000 Subject: [PATCH 46/50] add utils --- bertserini/utils/squad.py | 850 +++++++++++++++++++++++++++++++++++ bertserini/utils/utils_qa.py | 434 ++++++++++++++++++ 2 files changed, 1284 insertions(+) create mode 100644 bertserini/utils/squad.py create mode 100644 bertserini/utils/utils_qa.py diff --git a/bertserini/utils/squad.py b/bertserini/utils/squad.py new file mode 100644 index 0000000..b2ae5ea --- /dev/null +++ b/bertserini/utils/squad.py @@ -0,0 +1,850 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from functools import partial +from multiprocessing import Pool, cpu_count + +import numpy as np +from tqdm import tqdm + +from transformers.file_utils import is_tf_available, is_torch_available +from transformers.models.bert.tokenization_bert import whitespace_tokenize +from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy +from transformers.utils import logging +from transformers.data.processors.utils import DataProcessor + + +# Store the tokenizers which insert 2 separators tokens +MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"} + + +if is_torch_available(): + import torch + from torch.utils.data import TensorDataset + +if is_tf_available(): + import tensorflow as tf + +logger = logging.get_logger(__name__) + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max 
context' doc span for the token.""" + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _new_check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + # if len(doc_spans) == 1: + # return True + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span["start"] + doc_span["length"] - 1 + if position < doc_span["start"]: + continue + if position > end: + continue + num_left_context = position - doc_span["start"] + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def squad_convert_example_to_features( + example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training +): + features = [] + if is_training and not example.is_impossible: + # Get start and end position + start_position = example.start_position + end_position = example.end_position + + # If the answer cannot be found in the text, then skip this example. 
+ actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'") + return [] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + if tokenizer.__class__.__name__ in [ + "RobertaTokenizer", + "LongformerTokenizer", + "BartTokenizer", + "RobertaTokenizerFast", + "LongformerTokenizerFast", + "BartTokenizerFast", + ]: + sub_tokens = tokenizer.tokenize(token, add_prefix_space=True) + else: + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + spans = [] + + truncated_query = tokenizer.encode( + example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length + ) + + # Tokenizers who insert 2 SEP tokens in-between & need to have special handling + # in the way they compute mask of added tokens. 
+ tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower() + sequence_added_tokens = ( + tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1 + if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET + else tokenizer.model_max_length - tokenizer.max_len_single_sentence + ) + sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair + + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + + # Define the side we want to truncate / pad and the text/pair sorting + if tokenizer.padding_side == "right": + # texts = truncated_query + # pairs = span_doc_tokens + truncation = TruncationStrategy.ONLY_SECOND.value + else: + # texts = span_doc_tokens + # pairs = truncated_query + truncation = TruncationStrategy.ONLY_FIRST.value + + encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic + # # texts, + # pairs, + example.question_text, + example.context_text, + is_split_into_words=True, + truncation=truncation, + padding=padding_strategy, + max_length=max_seq_length, + return_overflowing_tokens=True, + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + return_token_type_ids=True, + ) + + paragraph_len = min( + len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens, + ) + + if tokenizer.pad_token_id in encoded_dict["input_ids"]: + if tokenizer.padding_side == "right": + non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] + else: + last_padding_id_position = ( + len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) + ) + non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] + + else: + non_padded_ids = encoded_dict["input_ids"] + # print(non_padded_ids) + tokens = [tokenizer.convert_ids_to_tokens(x) for x in non_padded_ids] + + token_to_orig_map = {} + for i 
in range(paragraph_len): + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + + if "overflowing_tokens" not in encoded_dict or ( + "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0 + ): + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = ( + j + if tokenizer.padding_side == "left" + else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + ) + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + for span in spans: + # Identify the position of the CLS token + cls_index = span["input_ids"].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implementation also keep the classification token (set to 0) + p_mask = np.ones_like(span["token_type_ids"]) + if tokenizer.padding_side == "right": + p_mask[len(truncated_query) + sequence_added_tokens :] = 0 + else: + p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0 + + pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id) + special_token_indices = np.asarray( + tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True) + 
).nonzero() + + p_mask[pad_token_indices] = 1 + p_mask[special_token_indices] = 1 + + # Set the cls index to 0: the CLS index can be used for impossible answers + p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if tokenizer.padding_side == "left": + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + features.append( + SquadFeatures( + span["input_ids"], + span["attention_mask"], + span["token_type_ids"], + cls_index, + p_mask.tolist(), + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. 
+ unique_id=0, + paragraph_len=span["paragraph_len"], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible, + qas_id=example.qas_id, + ) + ) + return features + + +def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase): + global tokenizer + tokenizer = tokenizer_for_convert + + +def squad_convert_examples_to_features( + examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + padding_strategy="max_length", + return_dataset=False, + threads=1, + tqdm_enabled=True, +): + """ + Converts a list of examples into a list of features that can be directly given as input to a model. It is + model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. + + Args: + examples: list of [`~data.processors.squad.SquadExample`] + tokenizer: an instance of a child of [`PreTrainedTokenizer`] + max_seq_length: The maximum sequence length of the inputs. + doc_stride: The stride used when the context is too large and is split across several features. + max_query_length: The maximum length of the query. + is_training: whether to create features for model evaluation or model training. + padding_strategy: Default to "max_length". Which padding strategy to use + return_dataset: Default False. Either 'pt' or 'tf'. + if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset + threads: multiple processing threads. 
+ + + Returns: + list of [`~data.processors.squad.SquadFeatures`] + + Example: + + ```python + processor = SquadV2Processor() + examples = processor.get_dev_examples(data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) + ```""" + # Defining helper methods + features = [] + + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + padding_strategy=padding_strategy, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + disable=not tqdm_enabled, + ) + ) + + new_features = [] + unique_id = 1000000000 + example_index = 0 + for example_features in tqdm( + features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled + ): + if not example_features: + continue + for example_feature in example_features: + example_feature.example_index = example_index + example_feature.unique_id = unique_id + new_features.append(example_feature) + unique_id += 1 + example_index += 1 + features = new_features + del new_features + if return_dataset == "pt": + if not is_torch_available(): + raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") + + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_cls_index = torch.tensor([f.cls_index for f 
in features], dtype=torch.long) + all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) + all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float) + + if not is_training: + all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + dataset = TensorDataset( + all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask + ) + else: + all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) + dataset = TensorDataset( + all_input_ids, + all_attention_masks, + all_token_type_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + all_is_impossible, + ) + + return features, dataset + elif return_dataset == "tf": + if not is_tf_available(): + raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") + + def gen(): + for i, ex in enumerate(features): + if ex.token_type_ids is None: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "feature_index": i, + "qas_id": ex.qas_id, + }, + { + "start_positions": ex.start_position, + "end_positions": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) + else: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + "feature_index": i, + "qas_id": ex.qas_id, + }, + { + "start_positions": ex.start_position, + "end_positions": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) + + # Why have we split the batch into a tuple? PyTorch just has a list of tensors. 
+ if "token_type_ids" in tokenizer.model_input_names: + train_types = ( + { + "input_ids": tf.int32, + "attention_mask": tf.int32, + "token_type_ids": tf.int32, + "feature_index": tf.int64, + "qas_id": tf.string, + }, + { + "start_positions": tf.int64, + "end_positions": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ) + + train_shapes = ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + "feature_index": tf.TensorShape([]), + "qas_id": tf.TensorShape([]), + }, + { + "start_positions": tf.TensorShape([]), + "end_positions": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ) + else: + train_types = ( + {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string}, + { + "start_positions": tf.int64, + "end_positions": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ) + + train_shapes = ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "feature_index": tf.TensorShape([]), + "qas_id": tf.TensorShape([]), + }, + { + "start_positions": tf.TensorShape([]), + "end_positions": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ) + + return tf.data.Dataset.from_generator(gen, train_types, train_shapes) + else: + return features + + +class SquadProcessor(DataProcessor): + """ + Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and + version 2.0 of SQuAD, respectively. 
+ """ + + train_file = None + dev_file = None + + def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): + if not evaluate: + answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") + answer_start = tensor_dict["answers"]["answer_start"][0].numpy() + answers = [] + else: + answers = [ + {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} + for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) + ] + + answer = None + answer_start = None + + return SquadExample( + qas_id=tensor_dict["id"].numpy().decode("utf-8"), + question_text=tensor_dict["question"].numpy().decode("utf-8"), + context_text=tensor_dict["context"].numpy().decode("utf-8"), + answer_text=answer, + start_position_character=answer_start, + title=tensor_dict["title"].numpy().decode("utf-8"), + answers=answers, + ) + + def get_examples_from_dataset(self, dataset, evaluate=False): + """ + Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset. + + Args: + dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")* + evaluate: Boolean specifying if in evaluation mode or in training mode + + Returns: + List of SquadExample + + Examples: + + ```python + >>> import tensorflow_datasets as tfds + + >>> dataset = tfds.load("squad") + + >>> training_examples = get_examples_from_dataset(dataset, evaluate=False) + >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) + ```""" + + if evaluate: + dataset = dataset["validation"] + else: + dataset = dataset["train"] + + examples = [] + for tensor_dict in tqdm(dataset): + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + + return examples + + def get_train_examples(self, data_dir, filename=None): + """ + Returns the training examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. 
+ filename: None by default, specify this if the training file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. + + """ + if data_dir is None: + data_dir = "" + + if self.train_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "train") + + def get_dev_examples(self, data_dir, filename=None): + """ + Returns the evaluation example from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the evaluation file has a different name than the original one + which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively. 
+ """ + if data_dir is None: + data_dir = "" + + if self.dev_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "dev") + + def _create_examples(self, input_data, set_type): + is_training = set_type == "train" + examples = [] + for entry in tqdm(input_data): + title = entry["title"] + for paragraph in entry["paragraphs"]: + context_text = paragraph["context"] + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position_character = None + answer_text = None + answers = [] + + is_impossible = qa.get("is_impossible", False) + if not is_impossible: + if is_training: + answer = qa["answers"][0] + answer_text = answer["text"] + start_position_character = answer["answer_start"] + else: + answers = qa["answers"] + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + context_text=context_text, + answer_text=answer_text, + start_position_character=start_position_character, + title=title, + is_impossible=is_impossible, + answers=answers, + ) + examples.append(example) + return examples + + +class SquadV1Processor(SquadProcessor): + train_file = "train-v1.1.json" + dev_file = "dev-v1.1.json" + + +class SquadV2Processor(SquadProcessor): + train_file = "train-v2.0.json" + dev_file = "dev-v2.0.json" + + +class SquadExample: + """ + A single training/test example for the Squad dataset, as loaded from disk. + + Args: + qas_id: The example's unique identifier + question_text: The question string + context_text: The context string + answer_text: The answer string + start_position_character: The character position of the start of the answer + title: The title of the example + answers: None by default, this is used during evaluation. 
Holds answers as well as their start positions. + is_impossible: False by default, set to True if the example has no possible answer. + """ + + def __init__( + self, + qas_id, + question_text, + context_text, + answer_text, + start_position_character, + title, + answers=[], + is_impossible=False, + ): + self.qas_id = qas_id + self.question_text = question_text + self.context_text = context_text + self.answer_text = answer_text + self.title = title + self.is_impossible = is_impossible + self.answers = answers + + self.start_position, self.end_position = 0, 0 + + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in self.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + self.doc_tokens = doc_tokens + self.char_to_word_offset = char_to_word_offset + + # Start and end positions only has a value during evaluation. + if start_position_character is not None and not is_impossible: + self.start_position = char_to_word_offset[start_position_character] + self.end_position = char_to_word_offset[ + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) + ] + + +class SquadFeatures: + """ + Single squad example features to be fed to a model. Those features are model-specific and can be crafted from + [`~data.processors.squad.SquadExample`] using the + :method:*~transformers.data.processors.squad.squad_convert_examples_to_features* method. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + cls_index: the index of the CLS token. 
+ p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. + Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer + example_index: the index of the example + unique_id: The unique Feature identifier + paragraph_len: The length of the context + token_is_max_context: + List of booleans identifying which tokens have their maximum context in this feature object. If a token + does not have their maximum context in this feature object, it means that another feature object has more + information related to that token and should be prioritized over this feature for that token. + tokens: list of tokens corresponding to the input ids + token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. + start_position: start of the answer token index + end_position: end of the answer token index + encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods. + """ + + def __init__( + self, + input_ids, + attention_mask, + token_type_ids, + cls_index, + p_mask, + example_index, + unique_id, + paragraph_len, + token_is_max_context, + tokens, + token_to_orig_map, + start_position, + end_position, + is_impossible, + qas_id: str = None, + encoding: BatchEncoding = None, + ): + self.input_ids = input_ids + self.attention_mask = attention_mask + self.token_type_ids = token_type_ids + self.cls_index = cls_index + self.p_mask = p_mask + + self.example_index = example_index + self.unique_id = unique_id + self.paragraph_len = paragraph_len + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + self.qas_id = qas_id + + self.encoding = encoding + + +class SquadResult: + """ + Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. 
+ + Args: + unique_id: The unique identifier corresponding to that example. + start_logits: The logits corresponding to the start of the answer + end_logits: The logits corresponding to the end of the answer + """ + + def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): + self.start_logits = start_logits + self.end_logits = end_logits + self.unique_id = unique_id + + if start_top_index: + self.start_top_index = start_top_index + self.end_top_index = end_top_index + self.cls_logits = cls_logits diff --git a/bertserini/utils/utils_qa.py b/bertserini/utils/utils_qa.py new file mode 100644 index 0000000..a73cd0a --- /dev/null +++ b/bertserini/utils/utils_qa.py @@ -0,0 +1,434 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Post-processing utilities for question answering. 
+""" +import collections +import json +import logging +import os +from typing import Optional, Tuple + +import numpy as np +from tqdm.auto import tqdm + + +logger = logging.getLogger(__name__) + + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. 
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. 
+ logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. 
+ if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. 
+            if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
+                predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
+
+            # NOTE(review): unlike the upstream utils_qa, softmax normalization is disabled in this copy:
+            # "probability" below holds the raw summed start/end logit score, not a normalized probability.
+            # The original softmax lines are kept commented out for reference.
+            probs = np.array([pred.pop("score") for pred in predictions])
+            # exp_scores = np.exp(scores - np.max(scores))
+            # probs = exp_scores / exp_scores.sum()
+
+            # Include the probabilities in our predictions.
+            for prob, pred in zip(probs, predictions):
+                pred["probability"] = prob
+
+            # Pick the best prediction. If the null answer is not possible, this is easy.
+            if not version_2_with_negative:
+                all_predictions[example["id"]] = predictions[0]["text"]
+            else:
+                # Otherwise we first need to find the best non-empty prediction.
+                i = 0
+                while predictions[i]["text"] == "":
+                    i += 1
+                best_non_null_pred = predictions[i]
+
+                # Then we compare to the null prediction using the threshold.
+                score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
+                scores_diff_json[example["id"]] = float(score_diff)  # To be JSON-serializable.
+                if score_diff > null_score_diff_threshold:
+                    all_predictions[example["id"]] = ""
+                else:
+                    all_predictions[example["id"]] = best_non_null_pred["text"]
+
+            # Make `predictions` JSON-serializable by casting np.float back to float.
+            all_nbest_json[example["id"]] = [
+                {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+                for pred in predictions
+            ]
+
+    # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, all_nbest_json + + +def postprocess_qa_predictions_with_beam_search( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + start_n_top: int = 5, + end_n_top: int = 5, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the + original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as + cls token predictions. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). 
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + start_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. + end_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 5: + raise ValueError("`predictions` should be a tuple with five elements.") + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. 
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() if version_2_with_negative else None + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_log_prob = start_top_log_probs[feature_index] + start_indexes = start_top_index[feature_index] + end_log_prob = end_top_log_probs[feature_index] + end_indexes = end_top_index[feature_index] + feature_null_score = cls_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction + if min_null_score is None or feature_null_score < min_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. 
+ for i in range(start_n_top): + for j in range(end_n_top): + start_index = int(start_indexes[i]) + j_index = i * end_n_top + j + end_index = int(end_indexes[j_index]) + # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the + # p_mask but let's not take any risk) + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length negative or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_log_prob[i] + end_log_prob[j_index], + "start_log_prob": start_log_prob[i], + "end_log_prob": end_log_prob[j_index], + } + ) + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0: + predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). 
+ scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction and set the probability for the null answer. + all_predictions[example["id"]] = predictions[0]["text"] + if version_2_with_negative: + scores_diff_json[example["id"]] = float(min_null_score) + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, scores_diff_json From 676ef62457681b186e1e763ac3e2819b8208de6c Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Wed, 24 
Aug 2022 20:12:24 +0000 Subject: [PATCH 47/50] refactor utils and fix minor bug --- README.md | 16 +- bertserini/experiments/inference.py | 2 +- bertserini/reader/bert_reader.py | 20 +- bertserini/utils/squad.py | 850 ---------------------------- bertserini/utils/utils.py | 26 + bertserini/utils/utils_new.py | 28 - inference_test.py | 2 +- 7 files changed, 43 insertions(+), 901 deletions(-) delete mode 100644 bertserini/utils/squad.py delete mode 100644 bertserini/utils/utils_new.py diff --git a/README.md b/README.md index 62aa78d..1028832 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,22 @@ We demonstrate an end-to-end Open-Domain question answering system that integrat Following the Open Domain QA setting of DrQA, we are using Wikipedia as the large scale knowledge source of documents. The system first retrieves several candidate text segmentations among the entire knowledge source of documents, then read through the candidate text segments to determine the answers. ## Package Installation + ``` -pip install bertserini +conda create -n bertserini python==3.8.0 +conda activate bertserini +conda install tqdm +pip install transformers==4.17 +pip install pyserini==0.17.0 +conda install -c pytorch faiss-gpu +pip install hanziconv +pip install zhon +pip install tensorboard ``` +Also, install pytorch following instructions here: https://pytorch.org/get-started/locally/ + + ## Development Installation BERTserini requires Python 3.6+ and a couple Python dependencies. The repo is tested on Python 3.6, Cuda 10.1, PyTorch 1.5.1 on Tesla P40 GPUs. @@ -33,7 +45,7 @@ Below is a example for English Question-Answering. 
We also provide an example fo ```python from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT -from bertserini.utils.utils_new import get_best_answer +from bertserini.utils.utils import get_best_answer model_name = "rsvp-ai/bertserini-bert-base-squad" tokenizer_name = "rsvp-ai/bertserini-bert-base-squad" diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index d6e7761..c7c9ad9 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -2,7 +2,7 @@ from tqdm import tqdm from bertserini.reader.bert_reader import BERT from bertserini.retriever.pyserini_retriever import retriever, build_searcher -from bertserini.utils.utils_new import extract_squad_questions +from bertserini.utils.utils import extract_squad_questions from bertserini.experiments.args import * import time diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 01c02c6..0582a3b 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -6,30 +6,11 @@ from datasets import Dataset import numpy as np -from bertserini.utils.squad import SquadExample from bertserini.utils.utils_qa import postprocess_qa_predictions from bertserini.reader.base import Reader, Question, Context, Answer __all__ = ['BERT'] -def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: - examples = [] - for idx, ctx in enumerate(contexts): - examples.append( - SquadExample( - qas_id=idx, - question_text=question.text, - context_text=ctx.text, - answer_text=None, - start_position_character=None, - title="", - is_impossible=False, - answers=[], - ) - ) - return examples - - class BERT(Reader): def __init__(self, args): self.model_args = args @@ -53,6 +34,7 @@ def __init__(self, args): "verbose_logging": False, "version_2_with_negative": True, "null_score_diff_threshold": 0, + "pad_on_right": False, } def update_args(self, 
args_to_change): diff --git a/bertserini/utils/squad.py b/bertserini/utils/squad.py deleted file mode 100644 index b2ae5ea..0000000 --- a/bertserini/utils/squad.py +++ /dev/null @@ -1,850 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from functools import partial -from multiprocessing import Pool, cpu_count - -import numpy as np -from tqdm import tqdm - -from transformers.file_utils import is_tf_available, is_torch_available -from transformers.models.bert.tokenization_bert import whitespace_tokenize -from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy -from transformers.utils import logging -from transformers.data.processors.utils import DataProcessor - - -# Store the tokenizers which insert 2 separators tokens -MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"} - - -if is_torch_available(): - import torch - from torch.utils.data import TensorDataset - -if is_tf_available(): - import tensorflow as tf - -logger = logging.get_logger(__name__) - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - 
text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def _new_check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - # if len(doc_spans) == 1: - # return True - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span["start"] + doc_span["length"] - 1 - if position < doc_span["start"]: - continue - if position > end: - continue - num_left_context = position - doc_span["start"] - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def _is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -def squad_convert_example_to_features( - example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training -): - features = [] - if is_training and not example.is_impossible: - # Get start and end position - start_position = example.start_position - end_position = example.end_position - - # If the 
answer cannot be found in the text, then skip this example. - actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) - cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'") - return [] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - if tokenizer.__class__.__name__ in [ - "RobertaTokenizer", - "LongformerTokenizer", - "BartTokenizer", - "RobertaTokenizerFast", - "LongformerTokenizerFast", - "BartTokenizerFast", - ]: - sub_tokens = tokenizer.tokenize(token, add_prefix_space=True) - else: - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text - ) - - spans = [] - - truncated_query = tokenizer.encode( - example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length - ) - - # Tokenizers who insert 2 SEP tokens in-between & need to have special handling - # in the way they compute mask of added tokens. 
- tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower() - sequence_added_tokens = ( - tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1 - if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET - else tokenizer.model_max_length - tokenizer.max_len_single_sentence - ) - sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair - - span_doc_tokens = all_doc_tokens - while len(spans) * doc_stride < len(all_doc_tokens): - - # Define the side we want to truncate / pad and the text/pair sorting - if tokenizer.padding_side == "right": - # texts = truncated_query - # pairs = span_doc_tokens - truncation = TruncationStrategy.ONLY_SECOND.value - else: - # texts = span_doc_tokens - # pairs = truncated_query - truncation = TruncationStrategy.ONLY_FIRST.value - - encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic - # # texts, - # pairs, - example.question_text, - example.context_text, - is_split_into_words=True, - truncation=truncation, - padding=padding_strategy, - max_length=max_seq_length, - return_overflowing_tokens=True, - stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - return_token_type_ids=True, - ) - - paragraph_len = min( - len(all_doc_tokens) - len(spans) * doc_stride, - max_seq_length - len(truncated_query) - sequence_pair_added_tokens, - ) - - if tokenizer.pad_token_id in encoded_dict["input_ids"]: - if tokenizer.padding_side == "right": - non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] - else: - last_padding_id_position = ( - len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) - ) - non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] - - else: - non_padded_ids = encoded_dict["input_ids"] - # print(non_padded_ids) - tokens = [tokenizer.convert_ids_to_tokens(x) for x in non_padded_ids] - - token_to_orig_map = {} - for i 
in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i - token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] - - encoded_dict["paragraph_len"] = paragraph_len - encoded_dict["tokens"] = tokens - encoded_dict["token_to_orig_map"] = token_to_orig_map - encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens - encoded_dict["token_is_max_context"] = {} - encoded_dict["start"] = len(spans) * doc_stride - encoded_dict["length"] = paragraph_len - - spans.append(encoded_dict) - - if "overflowing_tokens" not in encoded_dict or ( - "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0 - ): - break - span_doc_tokens = encoded_dict["overflowing_tokens"] - - for doc_span_index in range(len(spans)): - for j in range(spans[doc_span_index]["paragraph_len"]): - is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = ( - j - if tokenizer.padding_side == "left" - else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j - ) - spans[doc_span_index]["token_is_max_context"][index] = is_max_context - - for span in spans: - # Identify the position of the CLS token - cls_index = span["input_ids"].index(tokenizer.cls_token_id) - - # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - # Original TF implementation also keep the classification token (set to 0) - p_mask = np.ones_like(span["token_type_ids"]) - if tokenizer.padding_side == "right": - p_mask[len(truncated_query) + sequence_added_tokens :] = 0 - else: - p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0 - - pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id) - special_token_indices = np.asarray( - tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True) - 
).nonzero() - - p_mask[pad_token_indices] = 1 - p_mask[special_token_indices] = 1 - - # Set the cls index to 0: the CLS index can be used for impossible answers - p_mask[cls_index] = 0 - - span_is_impossible = example.is_impossible - start_position = 0 - end_position = 0 - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = span["start"] - doc_end = span["start"] + span["length"] - 1 - out_of_span = False - - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - - if out_of_span: - start_position = cls_index - end_position = cls_index - span_is_impossible = True - else: - if tokenizer.padding_side == "left": - doc_offset = 0 - else: - doc_offset = len(truncated_query) + sequence_added_tokens - - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - features.append( - SquadFeatures( - span["input_ids"], - span["attention_mask"], - span["token_type_ids"], - cls_index, - p_mask.tolist(), - example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. 
- unique_id=0, - paragraph_len=span["paragraph_len"], - token_is_max_context=span["token_is_max_context"], - tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"], - start_position=start_position, - end_position=end_position, - is_impossible=span_is_impossible, - qas_id=example.qas_id, - ) - ) - return features - - -def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase): - global tokenizer - tokenizer = tokenizer_for_convert - - -def squad_convert_examples_to_features( - examples, - tokenizer, - max_seq_length, - doc_stride, - max_query_length, - is_training, - padding_strategy="max_length", - return_dataset=False, - threads=1, - tqdm_enabled=True, -): - """ - Converts a list of examples into a list of features that can be directly given as input to a model. It is - model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. - - Args: - examples: list of [`~data.processors.squad.SquadExample`] - tokenizer: an instance of a child of [`PreTrainedTokenizer`] - max_seq_length: The maximum sequence length of the inputs. - doc_stride: The stride used when the context is too large and is split across several features. - max_query_length: The maximum length of the query. - is_training: whether to create features for model evaluation or model training. - padding_strategy: Default to "max_length". Which padding strategy to use - return_dataset: Default False. Either 'pt' or 'tf'. - if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset - threads: multiple processing threads. 
- - - Returns: - list of [`~data.processors.squad.SquadFeatures`] - - Example: - - ```python - processor = SquadV2Processor() - examples = processor.get_dev_examples(data_dir) - - features = squad_convert_examples_to_features( - examples=examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate, - ) - ```""" - # Defining helper methods - features = [] - - threads = min(threads, cpu_count()) - with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - annotate_ = partial( - squad_convert_example_to_features, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - padding_strategy=padding_strategy, - is_training=is_training, - ) - features = list( - tqdm( - p.imap(annotate_, examples, chunksize=32), - total=len(examples), - desc="convert squad examples to features", - disable=not tqdm_enabled, - ) - ) - - new_features = [] - unique_id = 1000000000 - example_index = 0 - for example_features in tqdm( - features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled - ): - if not example_features: - continue - for example_feature in example_features: - example_feature.example_index = example_index - example_feature.unique_id = unique_id - new_features.append(example_feature) - unique_id += 1 - example_index += 1 - features = new_features - del new_features - if return_dataset == "pt": - if not is_torch_available(): - raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") - - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - all_cls_index = torch.tensor([f.cls_index for f 
in features], dtype=torch.long) - all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) - all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float) - - if not is_training: - all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset( - all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask - ) - else: - all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset( - all_input_ids, - all_attention_masks, - all_token_type_ids, - all_start_positions, - all_end_positions, - all_cls_index, - all_p_mask, - all_is_impossible, - ) - - return features, dataset - elif return_dataset == "tf": - if not is_tf_available(): - raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") - - def gen(): - for i, ex in enumerate(features): - if ex.token_type_ids is None: - yield ( - { - "input_ids": ex.input_ids, - "attention_mask": ex.attention_mask, - "feature_index": i, - "qas_id": ex.qas_id, - }, - { - "start_positions": ex.start_position, - "end_positions": ex.end_position, - "cls_index": ex.cls_index, - "p_mask": ex.p_mask, - "is_impossible": ex.is_impossible, - }, - ) - else: - yield ( - { - "input_ids": ex.input_ids, - "attention_mask": ex.attention_mask, - "token_type_ids": ex.token_type_ids, - "feature_index": i, - "qas_id": ex.qas_id, - }, - { - "start_positions": ex.start_position, - "end_positions": ex.end_position, - "cls_index": ex.cls_index, - "p_mask": ex.p_mask, - "is_impossible": ex.is_impossible, - }, - ) - - # Why have we split the batch into a tuple? PyTorch just has a list of tensors. 
- if "token_type_ids" in tokenizer.model_input_names: - train_types = ( - { - "input_ids": tf.int32, - "attention_mask": tf.int32, - "token_type_ids": tf.int32, - "feature_index": tf.int64, - "qas_id": tf.string, - }, - { - "start_positions": tf.int64, - "end_positions": tf.int64, - "cls_index": tf.int64, - "p_mask": tf.int32, - "is_impossible": tf.int32, - }, - ) - - train_shapes = ( - { - "input_ids": tf.TensorShape([None]), - "attention_mask": tf.TensorShape([None]), - "token_type_ids": tf.TensorShape([None]), - "feature_index": tf.TensorShape([]), - "qas_id": tf.TensorShape([]), - }, - { - "start_positions": tf.TensorShape([]), - "end_positions": tf.TensorShape([]), - "cls_index": tf.TensorShape([]), - "p_mask": tf.TensorShape([None]), - "is_impossible": tf.TensorShape([]), - }, - ) - else: - train_types = ( - {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string}, - { - "start_positions": tf.int64, - "end_positions": tf.int64, - "cls_index": tf.int64, - "p_mask": tf.int32, - "is_impossible": tf.int32, - }, - ) - - train_shapes = ( - { - "input_ids": tf.TensorShape([None]), - "attention_mask": tf.TensorShape([None]), - "feature_index": tf.TensorShape([]), - "qas_id": tf.TensorShape([]), - }, - { - "start_positions": tf.TensorShape([]), - "end_positions": tf.TensorShape([]), - "cls_index": tf.TensorShape([]), - "p_mask": tf.TensorShape([None]), - "is_impossible": tf.TensorShape([]), - }, - ) - - return tf.data.Dataset.from_generator(gen, train_types, train_shapes) - else: - return features - - -class SquadProcessor(DataProcessor): - """ - Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and - version 2.0 of SQuAD, respectively. 
- """ - - train_file = None - dev_file = None - - def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): - if not evaluate: - answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") - answer_start = tensor_dict["answers"]["answer_start"][0].numpy() - answers = [] - else: - answers = [ - {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} - for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) - ] - - answer = None - answer_start = None - - return SquadExample( - qas_id=tensor_dict["id"].numpy().decode("utf-8"), - question_text=tensor_dict["question"].numpy().decode("utf-8"), - context_text=tensor_dict["context"].numpy().decode("utf-8"), - answer_text=answer, - start_position_character=answer_start, - title=tensor_dict["title"].numpy().decode("utf-8"), - answers=answers, - ) - - def get_examples_from_dataset(self, dataset, evaluate=False): - """ - Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset. - - Args: - dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")* - evaluate: Boolean specifying if in evaluation mode or in training mode - - Returns: - List of SquadExample - - Examples: - - ```python - >>> import tensorflow_datasets as tfds - - >>> dataset = tfds.load("squad") - - >>> training_examples = get_examples_from_dataset(dataset, evaluate=False) - >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) - ```""" - - if evaluate: - dataset = dataset["validation"] - else: - dataset = dataset["train"] - - examples = [] - for tensor_dict in tqdm(dataset): - examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) - - return examples - - def get_train_examples(self, data_dir, filename=None): - """ - Returns the training examples from the data directory. - - Args: - data_dir: Directory containing the data files used for training and evaluating. 
- filename: None by default, specify this if the training file has a different name than the original one - which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. - - """ - if data_dir is None: - data_dir = "" - - if self.train_file is None: - raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - - with open( - os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" - ) as reader: - input_data = json.load(reader)["data"] - return self._create_examples(input_data, "train") - - def get_dev_examples(self, data_dir, filename=None): - """ - Returns the evaluation example from the data directory. - - Args: - data_dir: Directory containing the data files used for training and evaluating. - filename: None by default, specify this if the evaluation file has a different name than the original one - which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively. 
- """ - if data_dir is None: - data_dir = "" - - if self.dev_file is None: - raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - - with open( - os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" - ) as reader: - input_data = json.load(reader)["data"] - return self._create_examples(input_data, "dev") - - def _create_examples(self, input_data, set_type): - is_training = set_type == "train" - examples = [] - for entry in tqdm(input_data): - title = entry["title"] - for paragraph in entry["paragraphs"]: - context_text = paragraph["context"] - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position_character = None - answer_text = None - answers = [] - - is_impossible = qa.get("is_impossible", False) - if not is_impossible: - if is_training: - answer = qa["answers"][0] - answer_text = answer["text"] - start_position_character = answer["answer_start"] - else: - answers = qa["answers"] - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - context_text=context_text, - answer_text=answer_text, - start_position_character=start_position_character, - title=title, - is_impossible=is_impossible, - answers=answers, - ) - examples.append(example) - return examples - - -class SquadV1Processor(SquadProcessor): - train_file = "train-v1.1.json" - dev_file = "dev-v1.1.json" - - -class SquadV2Processor(SquadProcessor): - train_file = "train-v2.0.json" - dev_file = "dev-v2.0.json" - - -class SquadExample: - """ - A single training/test example for the Squad dataset, as loaded from disk. - - Args: - qas_id: The example's unique identifier - question_text: The question string - context_text: The context string - answer_text: The answer string - start_position_character: The character position of the start of the answer - title: The title of the example - answers: None by default, this is used during evaluation. 
Holds answers as well as their start positions. - is_impossible: False by default, set to True if the example has no possible answer. - """ - - def __init__( - self, - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - title, - answers=[], - is_impossible=False, - ): - self.qas_id = qas_id - self.question_text = question_text - self.context_text = context_text - self.answer_text = answer_text - self.title = title - self.is_impossible = is_impossible - self.answers = answers - - self.start_position, self.end_position = 0, 0 - - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - - # Split on whitespace so that different tokens may be attributed to their original position. - for c in self.context_text: - if _is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - self.doc_tokens = doc_tokens - self.char_to_word_offset = char_to_word_offset - - # Start and end positions only has a value during evaluation. - if start_position_character is not None and not is_impossible: - self.start_position = char_to_word_offset[start_position_character] - self.end_position = char_to_word_offset[ - min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) - ] - - -class SquadFeatures: - """ - Single squad example features to be fed to a model. Those features are model-specific and can be crafted from - [`~data.processors.squad.SquadExample`] using the - :method:*~transformers.data.processors.squad.squad_convert_examples_to_features* method. - - Args: - input_ids: Indices of input sequence tokens in the vocabulary. - attention_mask: Mask to avoid performing attention on padding token indices. - token_type_ids: Segment token indices to indicate first and second portions of the inputs. - cls_index: the index of the CLS token. 
- p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. - Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer - example_index: the index of the example - unique_id: The unique Feature identifier - paragraph_len: The length of the context - token_is_max_context: - List of booleans identifying which tokens have their maximum context in this feature object. If a token - does not have their maximum context in this feature object, it means that another feature object has more - information related to that token and should be prioritized over this feature for that token. - tokens: list of tokens corresponding to the input ids - token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. - start_position: start of the answer token index - end_position: end of the answer token index - encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods. - """ - - def __init__( - self, - input_ids, - attention_mask, - token_type_ids, - cls_index, - p_mask, - example_index, - unique_id, - paragraph_len, - token_is_max_context, - tokens, - token_to_orig_map, - start_position, - end_position, - is_impossible, - qas_id: str = None, - encoding: BatchEncoding = None, - ): - self.input_ids = input_ids - self.attention_mask = attention_mask - self.token_type_ids = token_type_ids - self.cls_index = cls_index - self.p_mask = p_mask - - self.example_index = example_index - self.unique_id = unique_id - self.paragraph_len = paragraph_len - self.token_is_max_context = token_is_max_context - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - self.qas_id = qas_id - - self.encoding = encoding - - -class SquadResult: - """ - Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. 
- - Args: - unique_id: The unique identifier corresponding to that example. - start_logits: The logits corresponding to the start of the answer - end_logits: The logits corresponding to the end of the answer - """ - - def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): - self.start_logits = start_logits - self.end_logits = end_logits - self.unique_id = unique_id - - if start_top_index: - self.start_top_index = start_top_index - self.end_top_index = end_top_index - self.cls_logits = cls_logits diff --git a/bertserini/utils/utils.py b/bertserini/utils/utils.py index 5a3d2e4..ccca0e1 100644 --- a/bertserini/utils/utils.py +++ b/bertserini/utils/utils.py @@ -5,6 +5,10 @@ import re import zhon import numpy as np +from hanziconv import HanziConv + +from bertserini.reader.base import Question +from bertserini.utils.utils import strip_accents def strip_accents(text): @@ -174,3 +178,25 @@ def remove_punc(text): return ''.join(ch for ch in text if ch not in exclude) return remove_punc(s) + +def get_best_answer(candidates, weight=0.5): + for ans in candidates: + ans.aggregate_score(weight) + return sorted(candidates, key=lambda x: x.total_score, reverse=True)[0] + + +def extract_squad_questions(squad_filename, do_strip_accents=False, language="en"): + data = json.load(open(squad_filename, 'r')) + data = data["data"] + questions = [] + for article in data: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + id_ = qa["id"] + question = qa["question"] + if do_strip_accents: + question = strip_accents(question) + if language == "zh": + HanziConv.toSimplified(question) + questions.append(Question(question, id_, language)) + return questions diff --git a/bertserini/utils/utils_new.py b/bertserini/utils/utils_new.py deleted file mode 100644 index 09ee01f..0000000 --- a/bertserini/utils/utils_new.py +++ /dev/null @@ -1,28 +0,0 @@ -import json -from hanziconv import HanziConv - -from 
bertserini.reader.base import Question -from bertserini.utils.utils import strip_accents - - -def get_best_answer(candidates, weight=0.5): - for ans in candidates: - ans.aggregate_score(weight) - return sorted(candidates, key=lambda x: x.total_score, reverse=True)[0] - - -def extract_squad_questions(squad_filename, do_strip_accents=False, language="en"): - data = json.load(open(squad_filename, 'r')) - data = data["data"] - questions = [] - for article in data: - for paragraph in article["paragraphs"]: - for qa in paragraph["qas"]: - id_ = qa["id"] - question = qa["question"] - if do_strip_accents: - question = strip_accents(question) - if language == "zh": - HanziConv.toSimplified(question) - questions.append(Question(question, id_, language)) - return questions \ No newline at end of file diff --git a/inference_test.py b/inference_test.py index 182dd69..c8299d9 100644 --- a/inference_test.py +++ b/inference_test.py @@ -1,7 +1,7 @@ from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT from bertserini.reader.dpr_reader import DPR -from bertserini.utils.utils_new import get_best_answer +from bertserini.utils.utils import get_best_answer from bertserini.experiments.args import * from bertserini.retriever.pyserini_retriever import retriever, build_searcher From 5c962a3066701d0c412f7d48ce1b7c039621f7fb Mon Sep 17 00:00:00 2001 From: amyxie361 Date: Wed, 24 Aug 2022 20:13:27 +0000 Subject: [PATCH 48/50] fix minor bug --- bertserini/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bertserini/utils/utils.py b/bertserini/utils/utils.py index ccca0e1..52020b5 100644 --- a/bertserini/utils/utils.py +++ b/bertserini/utils/utils.py @@ -8,7 +8,7 @@ from hanziconv import HanziConv from bertserini.reader.base import Question -from bertserini.utils.utils import strip_accents +# from bertserini.utils.utils import strip_accents def strip_accents(text): From 4ef5e5eae634ea346fca6fb21a5f77f404885c29 Mon Sep 
17 00:00:00 2001 From: amyxie361 Date: Wed, 24 Aug 2022 22:22:39 +0000 Subject: [PATCH 49/50] clean up code and tqdm --- bertserini/experiments/inference.py | 6 +++--- bertserini/reader/bert_reader.py | 22 ++++------------------ bertserini/utils/utils.py | 2 -- bertserini/utils/utils_qa.py | 4 ++-- 4 files changed, 9 insertions(+), 25 deletions(-) diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index c7c9ad9..a88e622 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -14,11 +14,11 @@ all_answer = [] for question in tqdm(questions): - print("before retriever:", time.time()) + # print("before retriever:", time.time()) contexts = retriever(question, searcher, args.topk) - print("before reader:", time.time()) + # print("before reader:", time.time()) final_answers = bert_reader.predict(question, contexts) - print("after reader:", time.time()) + # print("after reader:", time.time()) final_answers_lst = [] for ans in final_answers: final_answers_lst.append( diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index 0582a3b..173193a 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -9,6 +9,8 @@ from bertserini.utils.utils_qa import postprocess_qa_predictions from bertserini.reader.base import Reader, Question, Context, Answer +from datasets.utils import logging + __all__ = ['BERT'] class BERT(Reader): @@ -41,9 +43,8 @@ def update_args(self, args_to_change): for key in args_to_change: self.args[key] = args_to_change[key] - - def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: + logging.disable_progress_bar() def prepare_validation_features(examples): question_column_name = "question" @@ -92,7 +93,6 @@ def prepare_validation_features(examples): (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] - # print(tokenized_examples) return 
tokenized_examples def create_and_fill_np_array(start_or_end_logits, dataset, max_len): @@ -138,24 +138,11 @@ def post_processing_function(examples, features, predictions, stage="eval"): max_answer_length=self.args["max_answer_length"], null_score_diff_threshold=self.args["null_score_diff_threshold"], output_dir="./tmp/", - # output_dir=self.args["output_dir"], - # log_level=log_level, prefix=stage, ) - # print(predictions) - # print(all_nbest_json) - # Format the result to the format the metric expects. - # if self.args["version_2_with_negative"]: - # formatted_predictions = [ - # {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() - # ] - # else: - # formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] - - # references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples] - # return EvalPrediction(predictions=formatted_predictions)#, label_ids=references) return all_nbest_json + inputs = {"question": [], "context": [], "id": []} for i, ctx in enumerate(contexts): inputs["question"].append(question.text) @@ -163,7 +150,6 @@ def post_processing_function(examples, features, predictions, stage="eval"): inputs["id"].append(i) eval_examples = Dataset.from_dict(inputs) column_names = eval_examples.column_names - eval_dataset = eval_examples.map( prepare_validation_features, batched=True, diff --git a/bertserini/utils/utils.py b/bertserini/utils/utils.py index 52020b5..ad4f2b6 100644 --- a/bertserini/utils/utils.py +++ b/bertserini/utils/utils.py @@ -8,8 +8,6 @@ from hanziconv import HanziConv from bertserini.reader.base import Question -# from bertserini.utils.utils import strip_accents - def strip_accents(text): return "".join(char for char in unicodedata.normalize('NFKD', text) diff --git a/bertserini/utils/utils_qa.py b/bertserini/utils/utils_qa.py index a73cd0a..dedcd52 100644 --- a/bertserini/utils/utils_qa.py +++ b/bertserini/utils/utils_qa.py @@ -97,7 +97,7 @@ def 
postprocess_qa_predictions( logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") # Let's loop over all the examples! - for example_index, example in enumerate(tqdm(examples)): + for example_index, example in enumerate(examples): # Those are the indices of the features associated to the current example. feature_indices = features_per_example[example_index] @@ -312,7 +312,7 @@ def postprocess_qa_predictions_with_beam_search( logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") # Let's loop over all the examples! - for example_index, example in enumerate(tqdm(examples)): + for example_index, example in enumerate(examples): # Those are the indices of the features associated to the current example. feature_indices = features_per_example[example_index] From 3ffd1e08a130e013b7e9bb69b1b65be44efcb563 Mon Sep 17 00:00:00 2001 From: AileenLin Date: Mon, 19 Sep 2022 20:31:37 +0000 Subject: [PATCH 50/50] add T5 reader and corresponding test --- bertserini/reader/t5_reader.py | 208 +++++++++++++++++++++++++++++++++ inference_test.py | 11 +- 2 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 bertserini/reader/t5_reader.py diff --git a/bertserini/reader/t5_reader.py b/bertserini/reader/t5_reader.py new file mode 100644 index 0000000..347181a --- /dev/null +++ b/bertserini/reader/t5_reader.py @@ -0,0 +1,208 @@ +from typing import List + +import torch +from torch.utils.data import DataLoader, SequentialSampler +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, EvalPrediction +import datasets +from datasets import Dataset +import numpy as np +from typing import List, Optional, Tuple + +from bertserini.reader.base import Reader, Question, Context, Answer + +from datasets.utils import logging + +__all__ = ['T5'] +class T5(Reader): + def __init__(self, args): + self.model_args = args + if self.model_args.tokenizer_name is None: + 
self.model_args.tokenizer_name = self.model_args.model_name_or_path + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True) + self.question_column = 'question' + self.context_column = 'context' + self.answer_column = 'answers' + ''' + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 1 \ + --output_dir ./models/s2s_squad2_0train/ \ + --eval_accumulation_steps 1 \ + --predict_with_generate \ + ''' + self.args = { + "max_seq_length": 384, + "doc_stride": 128, + "max_query_length": 64, + "threads": 1, + "tqdm_enabled": False, + "n_best_size": 20, + "max_answer_length": 384, + "do_lower_case": True, + "output_prediction_file": False, + "output_nbest_file": self.model_args.output_nbest_file, + "output_null_log_odds_file": None, + "verbose_logging": False, + "version_2_with_negative": True, + "null_score_diff_threshold": 0, + "ignore_pad_token_for_loss": True + } + + def update_args(self, args_to_change): + for key in args_to_change: + self.args[key] = args_to_change[key] + + def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: + logging.disable_progress_bar() + + def preprocess_squad_batch( + examples, + question_column: str, + context_column: str, + answer_column: str, + ) -> Tuple[List[str], List[str]]: + questions = examples[question_column] + contexts = examples[context_column] + answers = examples.get(answer_column,[]) + + def generate_input(_question, _context): + return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()]) + + inputs = [generate_input(question, context) for question, context in zip(questions, contexts)] + targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers] + return inputs, targets + + def preprocess_function(examples): + 
inputs, targets = preprocess_squad_batch(examples, self.question_column, self.context_column, self.answer_column) + + model_inputs = self.tokenizer(inputs, max_length=self.args["max_seq_length"], padding='max_length', truncation=True) + # Setup the tokenizer for targets + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer(targets, max_length=self.args['max_answer_length'], padding='max_length', truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if self.args['ignore_pad_token_for_loss']: + labels["input_ids"] = [ + [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + # Validation preprocessing + def preprocess_validation_function(examples): + inputs, targets = preprocess_squad_batch(examples, self.question_column, self.context_column, self.answer_column) + + model_inputs = self.tokenizer( + inputs, + max_length=self.args["max_seq_length"], + padding='max_length', + truncation=True, + return_offsets_mapping=True, + ) + + if targets: + # Setup the tokenizer for targets + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer(targets, max_length=self.args['max_answer_length'], padding='max_length', truncation=True) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + # sample_mapping = model_inputs.pop("overflow_to_sample_mapping") + sample_mapping = list(range(len(model_inputs["input_ids"]))) + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. 
+ model_inputs["example_id"] = [] + + for i in range(len(model_inputs["input_ids"])): + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + model_inputs["example_id"].append(examples["id"][sample_index]) + + if targets: + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if self.args['ignore_pad_token_for_loss']: + labels["input_ids"] = [ + [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + def post_processing_function(examples: datasets.Dataset, features: datasets.Dataset, outputs, stage="eval"): + # Decode the predicted tokens. + decoded_preds = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)} + predictions = {} + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # This is the index of the feature associated to the current example. + feature_index = feature_per_example[example_index] + predictions[example["id"]] = decoded_preds[feature_index] + + # Format the result to the format the metric expects. 
+ if self.args['version_2_with_negative']: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + # references = [{"id": ex["id"], "answers": ex[self.answer_column]} for ex in examples] + # return EvalPrediction(predictions=formatted_predictions, label_ids=references) + return formatted_predictions + + + + inputs = {"question": [], "context": [], "id": []} + for i, ctx in enumerate(contexts): + inputs["question"].append(question.text) + inputs["context"].append(contexts[i].text) + inputs["id"].append(i) + print(inputs) + eval_examples = Dataset.from_dict(inputs) + column_names = eval_examples.column_names + eval_dataset = eval_examples.map( + preprocess_validation_function, + batched=True, + num_proc=1, + remove_columns=column_names, + ) + + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) + + eval_dataloader = DataLoader( + eval_dataset_for_model, + collate_fn=default_data_collator, + batch_size=self.model_args.eval_batch_size, + ) + raw_predict = [] + for batch in eval_dataloader: + for k in batch: + batch[k] = batch[k].to(self.device) + outs = self.model.generate(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=16, + early_stopping=True) + raw_predict.extend(outs) + all_nbest_json = post_processing_function(eval_examples, eval_dataset, raw_predict) + + all_answers = [] + for item in all_nbest_json: + all_answers.append(Answer( + text=item["prediction_text"], + score=0.0, + # score=all_nbest_json[ans][0]["start_logit"] + all_nbest_json[ans][0]["end_logit"], + ctx_score=contexts[item['id']].score, + language=question.language + )) + return all_answers + diff --git a/inference_test.py b/inference_test.py index c8299d9..edc06a7 100644 --- a/inference_test.py +++ b/inference_test.py @@ -1,11 +1,12 @@ from 
bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT from bertserini.reader.dpr_reader import DPR +from bertserini.reader.t5_reader import T5 from bertserini.utils.utils import get_best_answer from bertserini.experiments.args import * from bertserini.retriever.pyserini_retriever import retriever, build_searcher -ENG_reader = "BERT" +ENG_reader = "T5" do_local_test = True do_bm25_test = True do_dpr_test = False @@ -21,8 +22,13 @@ args.tokenizer_name = "facebook/dpr-reader-multiset-base" bert_reader = DPR(args) +elif ENG_reader == "T5": + args.model_name_or_path = "/data/aileen/workspace/t5_test2/models/gpu/checkpoint-10500" + args.tokenizer_name = "t5-base" + bert_reader = T5(args) + # question = Question("Why did Mark Twain call the 19th century the glied age?") -question = Question("Where is the capital of China?") +question = Question("What is the capital city of China?") print(question.text) @@ -40,6 +46,7 @@ searcher = build_searcher(args) contexts = retriever(question, searcher, 10) candidates = bert_reader.predict(question, contexts) + print(candidates) answer = get_best_answer(candidates, 0.45) print("Answer:", answer.text) # todo: no context returned. is the context included? maybe update to another question # print("BM25 Test Passed")