Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,22 @@ We demonstrate an end-to-end Open-Domain question answering system that integrat
Following the Open-Domain QA setting of DrQA, we use Wikipedia as the large-scale knowledge source of documents. The system first retrieves several candidate text segments from the entire knowledge source, then reads through the candidate segments to determine the answers.

## Package Installation

```
pip install bertserini
conda create -n bertserini python==3.8.0
conda activate bertserini
conda install tqdm
pip install transformers==4.17
pip install pyserini==0.17.0
conda install -c pytorch faiss-gpu
pip install hanziconv
pip install zhon
pip install tensorboard
```

Also, install PyTorch following the instructions here: https://pytorch.org/get-started/locally/


## Development Installation
BERTserini requires Python 3.6+ and a couple of Python dependencies.
The repo is tested on Python 3.6, Cuda 10.1, PyTorch 1.5.1 on Tesla P40 GPUs.
Expand All @@ -33,7 +45,7 @@ Below is a example for English Question-Answering. We also provide an example fo
```python
from bertserini.reader.base import Question, Context
from bertserini.reader.bert_reader import BERT
from bertserini.utils.utils_new import get_best_answer
from bertserini.utils.utils import get_best_answer

model_name = "rsvp-ai/bertserini-bert-base-squad"
tokenizer_name = "rsvp-ai/bertserini-bert-base-squad"
Expand Down
53 changes: 48 additions & 5 deletions bertserini/experiments/eval/evaluate_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
import argparse
import json

from rouge_metric import PyRouge
rouge = PyRouge(rouge_n=(2,), rouge_su=True, skip_gap=4)
#from rouge_score import rouge_scorer
#rouge1_scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
#rougel_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

from bertserini.utils.utils import normalize_answer, init_logger

logger = init_logger("evluation")
Expand Down Expand Up @@ -67,6 +73,32 @@ def overlap_score(prediction, ground_truth):
def exact_match_score(prediction, ground_truth):
    """Return True iff the normalized prediction equals the normalized ground truth."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

def rouge2_r_score(prediction, ground_truth):
    """Return the ROUGE-2 recall score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["r"]

def rouge2_f_score(prediction, ground_truth):
    """Return the ROUGE-2 F1 score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["f"]

def rougesu4_r_score(prediction, ground_truth):
    """Return the ROUGE-SU4 recall score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["r"]

def rougesu4_f_score(prediction, ground_truth):
    """Return the ROUGE-SU4 F1 score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["f"]

#def rougel_score(prediction, ground_truth):
# print(rougel_scorer.score(prediction, ground_truth))
# input()
# return rougel_scorer.score(prediction, ground_truth)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
Expand All @@ -92,7 +124,7 @@ def metric_max_recall(metric_fn, prediction, ground_truths):


def evaluate(dataset, predictions):
sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = 0
sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = rouge2_r = rouge2_f = rougesu4_r = rougesu4_f = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
Expand All @@ -104,6 +136,11 @@ def evaluate(dataset, predictions):
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = [predictions[qa['id']]]
#prediction_sentence = predictions[qa['id']]['sentences']
rouge2_r += metric_max_recall(rouge2_r_score, prediction, ground_truths)
rouge2_f += metric_max_recall(rouge2_f_score, prediction, ground_truths)
rougesu4_r += metric_max_recall(rougesu4_r_score, prediction, ground_truths)
rougesu4_f += metric_max_recall(rougesu4_f_score, prediction, ground_truths)
#rougel += metric_max_recall(rougel_score, prediction, ground_truths)
cover += metric_max_recall(cover_score, prediction, ground_truths)
exact_match += metric_max_recall(
exact_match_score, prediction, ground_truths)
Expand All @@ -124,21 +161,27 @@ def evaluate(dataset, predictions):
overlap = 100.0 * overlap / total
cover = 100.0 * cover / total
precision = 100.0 * precision / total
rouge2_r = 100.0 * rouge2_r / total
rouge2_f = 100.0 * rouge2_f / total
rougesu4_r = 100.0 * rougesu4_r / total
rougesu4_f = 100.0 * rougesu4_f / total
#rougel = 100.0 * rougel / total
#sentence_recall = 100.0 * sentence_recall / total
#sentence_cover = 100.0 * sentence_cover / total

return {'exact_match': exact_match, 'f1': f1, "recall": recall,
#"sentence_recall": sentence_recall, "sentence_cover": sentence_cover,
"precision": precision, "cover": cover, "overlap": overlap}
"precision": precision, "cover": cover, "overlap": overlap,
"rouge2_r": rouge2_r, "rouge2_f":rouge2_f, "rougesu4_r": rougesu4_r, "rougesu4_f": rougesu4_f}


def squad_v1_eval(dataset_filename, prediction_filename):
expected_version = '1.1'
with open(dataset_filename) as dataset_file:
dataset_json = json.load(dataset_file)
if dataset_json['version'] != expected_version:
logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format(
expected_version, dataset_json['version']))
#if dataset_json['version'] != expected_version:
# logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format(
# expected_version, dataset_json['version']))
dataset = dataset_json['data']
with open(prediction_filename) as prediction_file:
predictions = json.load(prediction_file)
Expand Down
6 changes: 5 additions & 1 deletion bertserini/experiments/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from tqdm import tqdm
from bertserini.reader.bert_reader import BERT
from bertserini.retriever.pyserini_retriever import retriever, build_searcher
from bertserini.utils.utils_new import extract_squad_questions
from bertserini.utils.utils import extract_squad_questions
from bertserini.experiments.args import *
import time

if __name__ == "__main__":
questions = extract_squad_questions(args.dataset_path, do_strip_accents=args.strip_accents)
Expand All @@ -13,8 +14,11 @@

all_answer = []
for question in tqdm(questions):
# print("before retriever:", time.time())
contexts = retriever(question, searcher, args.topk)
# print("before reader:", time.time())
final_answers = bert_reader.predict(question, contexts)
# print("after reader:", time.time())
final_answers_lst = []
for ans in final_answers:
final_answers_lst.append(
Expand Down
Loading