Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,22 @@ We demonstrate an end-to-end Open-Domain question answering system that integrat
Following the Open-Domain QA setting of DrQA, we use Wikipedia as the large-scale knowledge source of documents. The system first retrieves several candidate text segments from the entire knowledge source, then reads through the candidate segments to determine the answers.

## Package Installation

```
pip install bertserini
conda create -n bertserini python==3.8.0
conda activate bertserini
conda install tqdm
pip install transformers==4.17
pip install pyserini==0.17.0
conda install -c pytorch faiss-gpu
pip install hanziconv
pip install zhon
pip install tensorboard
```

Also, install PyTorch following the instructions here: https://pytorch.org/get-started/locally/


## Development Installation
BERTserini requires Python 3.6+ and a couple of Python dependencies.
The repo is tested on Python 3.6, Cuda 10.1, PyTorch 1.5.1 on Tesla P40 GPUs.
Expand All @@ -33,7 +45,7 @@ Below is a example for English Question-Answering. We also provide an example fo
```python
from bertserini.reader.base import Question, Context
from bertserini.reader.bert_reader import BERT
from bertserini.utils.utils_new import get_best_answer
from bertserini.utils.utils import get_best_answer

model_name = "rsvp-ai/bertserini-bert-base-squad"
tokenizer_name = "rsvp-ai/bertserini-bert-base-squad"
Expand Down
53 changes: 48 additions & 5 deletions bertserini/experiments/eval/evaluate_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
import argparse
import json

from rouge_metric import PyRouge
rouge = PyRouge(rouge_n=(2,), rouge_su=True, skip_gap=4)
#from rouge_score import rouge_scorer
#rouge1_scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
#rougel_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

from bertserini.utils.utils import normalize_answer, init_logger

logger = init_logger("evluation")
Expand Down Expand Up @@ -67,6 +73,32 @@ def overlap_score(prediction, ground_truth):
def exact_match_score(prediction, ground_truth):
    """Return True iff the normalized prediction equals the normalized ground truth."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

def rouge2_r_score(prediction, ground_truth):
    """Return the ROUGE-2 recall score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["r"]

def rouge2_f_score(prediction, ground_truth):
    """Return the ROUGE-2 F1 score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["f"]

def rougesu4_r_score(prediction, ground_truth):
    """Return the ROUGE-SU4 recall score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["r"]

def rougesu4_f_score(prediction, ground_truth):
    """Return the ROUGE-SU4 F1 score between `prediction` and `ground_truth`.

    An empty prediction scores 0 without invoking the scorer.
    """
    # `not prediction` covers both "" and None; `len(prediction) == 0`
    # would raise TypeError on a None prediction.
    if not prediction:
        return 0
    # NOTE(review): PyRouge.evaluate(hypotheses, multi_references) — here the
    # ground truth is passed as the hypothesis and the prediction as the
    # reference. Confirm this ordering is intentional.
    return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["f"]

#def rougel_score(prediction, ground_truth):
# print(rougel_scorer.score(prediction, ground_truth))
# input()
# return rougel_scorer.score(prediction, ground_truth)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
Expand All @@ -92,7 +124,7 @@ def metric_max_recall(metric_fn, prediction, ground_truths):


def evaluate(dataset, predictions):
sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = 0
sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = rouge2_r = rouge2_f = rougesu4_r = rougesu4_f = 0
for article in dataset:
for paragraph in article['paragraphs']:
for qa in paragraph['qas']:
Expand All @@ -104,6 +136,11 @@ def evaluate(dataset, predictions):
ground_truths = list(map(lambda x: x['text'], qa['answers']))
prediction = [predictions[qa['id']]]
#prediction_sentence = predictions[qa['id']]['sentences']
rouge2_r += metric_max_recall(rouge2_r_score, prediction, ground_truths)
rouge2_f += metric_max_recall(rouge2_f_score, prediction, ground_truths)
rougesu4_r += metric_max_recall(rougesu4_r_score, prediction, ground_truths)
rougesu4_f += metric_max_recall(rougesu4_f_score, prediction, ground_truths)
#rougel += metric_max_recall(rougel_score, prediction, ground_truths)
cover += metric_max_recall(cover_score, prediction, ground_truths)
exact_match += metric_max_recall(
exact_match_score, prediction, ground_truths)
Expand All @@ -124,21 +161,27 @@ def evaluate(dataset, predictions):
overlap = 100.0 * overlap / total
cover = 100.0 * cover / total
precision = 100.0 * precision / total
rouge2_r = 100.0 * rouge2_r / total
rouge2_f = 100.0 * rouge2_f / total
rougesu4_r = 100.0 * rougesu4_r / total
rougesu4_f = 100.0 * rougesu4_f / total
#rougel = 100.0 * rougel / total
#sentence_recall = 100.0 * sentence_recall / total
#sentence_cover = 100.0 * sentence_cover / total

return {'exact_match': exact_match, 'f1': f1, "recall": recall,
#"sentence_recall": sentence_recall, "sentence_cover": sentence_cover,
"precision": precision, "cover": cover, "overlap": overlap}
"precision": precision, "cover": cover, "overlap": overlap,
"rouge2_r": rouge2_r, "rouge2_f":rouge2_f, "rougesu4_r": rougesu4_r, "rougesu4_f": rougesu4_f}


def squad_v1_eval(dataset_filename, prediction_filename):
expected_version = '1.1'
with open(dataset_filename) as dataset_file:
dataset_json = json.load(dataset_file)
if dataset_json['version'] != expected_version:
logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format(
expected_version, dataset_json['version']))
#if dataset_json['version'] != expected_version:
# logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format(
# expected_version, dataset_json['version']))
dataset = dataset_json['data']
with open(prediction_filename) as prediction_file:
predictions = json.load(prediction_file)
Expand Down
6 changes: 5 additions & 1 deletion bertserini/experiments/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from tqdm import tqdm
from bertserini.reader.bert_reader import BERT
from bertserini.retriever.pyserini_retriever import retriever, build_searcher
from bertserini.utils.utils_new import extract_squad_questions
from bertserini.utils.utils import extract_squad_questions
from bertserini.experiments.args import *
import time

if __name__ == "__main__":
questions = extract_squad_questions(args.dataset_path, do_strip_accents=args.strip_accents)
Expand All @@ -13,8 +14,11 @@

all_answer = []
for question in tqdm(questions):
# print("before retriever:", time.time())
contexts = retriever(question, searcher, args.topk)
# print("before reader:", time.time())
final_answers = bert_reader.predict(question, contexts)
# print("after reader:", time.time())
final_answers_lst = []
for ans in final_answers:
final_answers_lst.append(
Expand Down
Loading