diff --git a/README.md b/README.md index 530687c..1028832 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ Below is a example for English Question-Answering. We also provide an example fo ```python from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT -from bertserini.utils.utils_new import get_best_answer +from bertserini.utils.utils import get_best_answer model_name = "rsvp-ai/bertserini-bert-base-squad" tokenizer_name = "rsvp-ai/bertserini-bert-base-squad" diff --git a/bertserini/experiments/eval/evaluate_v1.py b/bertserini/experiments/eval/evaluate_v1.py index 8c32312..300cf9f 100755 --- a/bertserini/experiments/eval/evaluate_v1.py +++ b/bertserini/experiments/eval/evaluate_v1.py @@ -4,6 +4,12 @@ import argparse import json +from rouge_metric import PyRouge +rouge = PyRouge(rouge_n=(2,), rouge_su=True, skip_gap=4) +#from rouge_score import rouge_scorer +#rouge1_scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True) +#rougel_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True) + from bertserini.utils.utils import normalize_answer, init_logger logger = init_logger("evluation") @@ -67,6 +73,32 @@ def overlap_score(prediction, ground_truth): def exact_match_score(prediction, ground_truth): return normalize_answer(prediction) == normalize_answer(ground_truth) +def rouge2_r_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["r"] + #return rouge1_scorer.score(prediction, ground_truth) + +def rouge2_f_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-2"]["f"] + +def rougesu4_r_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["r"] + +def rougesu4_f_score(prediction, ground_truth): + if len(prediction) == 0: + return 0 + return 
rouge.evaluate([ground_truth], [[prediction]])["rouge-su4"]["f"] + +#def rougel_score(prediction, ground_truth): +# print(rougel_scorer.score(prediction, ground_truth)) +# input() +# return rougel_scorer.score(prediction, ground_truth) + def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): scores_for_ground_truths = [] @@ -92,7 +124,7 @@ def metric_max_recall(metric_fn, prediction, ground_truths): def evaluate(dataset, predictions): - sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = 0 + sentence_cover = precision = cover = sentence_recall = recall = f1 = exact_match = total = overlap = rouge2_r = rouge2_f = rougesu4_r = rougesu4_f = 0 for article in dataset: for paragraph in article['paragraphs']: for qa in paragraph['qas']: @@ -104,6 +136,11 @@ def evaluate(dataset, predictions): ground_truths = list(map(lambda x: x['text'], qa['answers'])) prediction = [predictions[qa['id']]] #prediction_sentence = predictions[qa['id']]['sentences'] + rouge2_r += metric_max_recall(rouge2_r_score, prediction, ground_truths) + rouge2_f += metric_max_recall(rouge2_f_score, prediction, ground_truths) + rougesu4_r += metric_max_recall(rougesu4_r_score, prediction, ground_truths) + rougesu4_f += metric_max_recall(rougesu4_f_score, prediction, ground_truths) + #rougel += metric_max_recall(rougel_score, prediction, ground_truths) cover += metric_max_recall(cover_score, prediction, ground_truths) exact_match += metric_max_recall( exact_match_score, prediction, ground_truths) @@ -124,21 +161,27 @@ def evaluate(dataset, predictions): overlap = 100.0 * overlap / total cover = 100.0 * cover / total precision = 100.0 * precision / total + rouge2_r = 100.0 * rouge2_r / total + rouge2_f = 100.0 * rouge2_f / total + rougesu4_r = 100.0 * rougesu4_r / total + rougesu4_f = 100.0 * rougesu4_f / total + #rougel = 100.0 * rougel / total #sentence_recall = 100.0 * sentence_recall / total #sentence_cover = 100.0 * sentence_cover / 
total return {'exact_match': exact_match, 'f1': f1, "recall": recall, #"sentence_recall": sentence_recall, "sentence_cover": sentence_cover, - "precision": precision, "cover": cover, "overlap": overlap} + "precision": precision, "cover": cover, "overlap": overlap, + "rouge2_r": rouge2_r, "rouge2_f":rouge2_f, "rougesu4_r": rougesu4_r, "rougesu4_f": rougesu4_f} def squad_v1_eval(dataset_filename, prediction_filename): expected_version = '1.1' with open(dataset_filename) as dataset_file: dataset_json = json.load(dataset_file) - if dataset_json['version'] != expected_version: - logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( - expected_version, dataset_json['version'])) + #if dataset_json['version'] != expected_version: + # logger.error('Evaluation expects v-{}, but got dataset with v-{}'.format( + # expected_version, dataset_json['version'])) dataset = dataset_json['data'] with open(prediction_filename) as prediction_file: predictions = json.load(prediction_file) diff --git a/bertserini/experiments/inference.py b/bertserini/experiments/inference.py index a204218..a88e622 100644 --- a/bertserini/experiments/inference.py +++ b/bertserini/experiments/inference.py @@ -2,8 +2,9 @@ from tqdm import tqdm from bertserini.reader.bert_reader import BERT from bertserini.retriever.pyserini_retriever import retriever, build_searcher -from bertserini.utils.utils_new import extract_squad_questions +from bertserini.utils.utils import extract_squad_questions from bertserini.experiments.args import * +import time if __name__ == "__main__": questions = extract_squad_questions(args.dataset_path, do_strip_accents=args.strip_accents) @@ -13,8 +14,11 @@ all_answer = [] for question in tqdm(questions): + # print("before retriever:", time.time()) contexts = retriever(question, searcher, args.topk) + # print("before reader:", time.time()) final_answers = bert_reader.predict(question, contexts) + # print("after reader:", time.time()) final_answers_lst = [] for ans in 
final_answers: final_answers_lst.append( diff --git a/bertserini/reader/bert_reader.py b/bertserini/reader/bert_reader.py index b0256ce..173193a 100644 --- a/bertserini/reader/bert_reader.py +++ b/bertserini/reader/bert_reader.py @@ -2,33 +2,16 @@ import torch from torch.utils.data import DataLoader, SequentialSampler -from transformers import AutoTokenizer, AutoModelForQuestionAnswering, SquadExample, squad_convert_examples_to_features -from transformers.data.processors.squad import SquadResult +from transformers import AutoTokenizer, AutoModelForQuestionAnswering, default_data_collator, EvalPrediction +from datasets import Dataset +import numpy as np +from bertserini.utils.utils_qa import postprocess_qa_predictions from bertserini.reader.base import Reader, Question, Context, Answer -from bertserini.utils.utils_squad_metrics import compute_predictions_logits -__all__ = ['BERT'] - -from bertserini.train.run_squad import to_list - -def craft_squad_examples(question: Question, contexts: List[Context]) -> List[SquadExample]: - examples = [] - for idx, ctx in enumerate(contexts): - examples.append( - SquadExample( - qas_id=idx, - question_text=question.text, - context_text=ctx.text, - answer_text=None, - start_position_character=None, - title="", - is_impossible=False, - answers=[], - ) - ) - return examples +from datasets.utils import logging +__all__ = ['BERT'] class BERT(Reader): def __init__(self, args): @@ -37,7 +20,7 @@ def __init__(self, args): self.model_args.tokenizer_name = self.model_args.model_name_or_path self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() - self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True, use_fast=False) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True) self.args = { "max_seq_length": 384, 
"doc_stride": 128, @@ -45,7 +28,7 @@ def __init__(self, args): "threads": 1, "tqdm_enabled": False, "n_best_size": 20, - "max_answer_length": 30, + "max_answer_length": 384, "do_lower_case": True, "output_prediction_file": False, "output_nbest_file": self.model_args.output_nbest_file, @@ -53,6 +36,7 @@ def __init__(self, args): "verbose_logging": False, "version_2_with_negative": True, "null_score_diff_threshold": 0, + "pad_on_right": False, } def update_args(self, args_to_change): @@ -60,77 +44,155 @@ def update_args(self, args_to_change): self.args[key] = args_to_change[key] def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: - examples = craft_squad_examples(question, contexts) - - features, dataset = squad_convert_examples_to_features( - examples=examples, - tokenizer=self.tokenizer, - max_seq_length=self.args["max_seq_length"], - doc_stride=self.args["doc_stride"], - max_query_length=self.args["max_query_length"], - is_training=False, - return_dataset="pt", - threads=self.args["threads"], - tqdm_enabled=self.args["tqdm_enabled"] - ) + logging.disable_progress_bar() + + def prepare_validation_features(examples): + question_column_name = "question" + context_column_name = "context" + # answer_column_name = "answers" if "answers" in column_names else column_names[2] + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. 
+ tokenized_examples = self.tokenizer( + examples[question_column_name if self.args["pad_on_right"] else context_column_name], + examples[context_column_name if self.args["pad_on_right"] else question_column_name], + truncation="only_second" if self.args["pad_on_right"] else "only_first", + max_length=self.args["max_seq_length"], + stride=self.args["doc_stride"], + return_overflowing_tokens=True, + return_offsets_mapping=True, + verbose=False, + padding="max_length", + ) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.model_args.eval_batch_size) + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if self.args["pad_on_right"] else 0 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. 
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + return tokenized_examples + + def create_and_fill_np_array(start_or_end_logits, dataset, max_len): + """ + Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor + + Args: + start_or_end_logits(:obj:`tensor`): + This is the output predictions of the model. We can only enter either start or end logits. + eval_dataset: Evaluation dataset + max_len(:obj:`int`): + The maximum length of the output tensor. ( See the model.eval() part for more details ) + """ + + step = 0 + # create a numpy array and fill it with -100. + logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64) + # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather + for i, output_logit in enumerate(start_or_end_logits): # populate columns + # We have to fill it such that we have to take the whole tensor and replace it on the newly created array + # And after every iteration we have to change the step + + batch_size = output_logit.shape[0] + cols = output_logit.shape[1] + + if step + batch_size < len(dataset): + logits_concat[step: step + batch_size, :cols] = output_logit + else: + logits_concat[step:, :cols] = output_logit[: len(dataset) - step] + + step += batch_size + + return logits_concat + + def post_processing_function(examples, features, predictions, stage="eval"): + # Post-processing: we match the start logits and end logits to answers in the original context. 
+ _, all_nbest_json = postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=self.args["version_2_with_negative"], + n_best_size=self.args["n_best_size"], + max_answer_length=self.args["max_answer_length"], + null_score_diff_threshold=self.args["null_score_diff_threshold"], + output_dir="./tmp/", + prefix=stage, + ) + return all_nbest_json + + + inputs = {"question": [], "context": [], "id": []} + for i, ctx in enumerate(contexts): + inputs["question"].append(question.text) + inputs["context"].append(contexts[i].text) + inputs["id"].append(i) + eval_examples = Dataset.from_dict(inputs) + column_names = eval_examples.column_names + eval_dataset = eval_examples.map( + prepare_validation_features, + batched=True, + num_proc=1, + remove_columns=column_names, + ) - all_results = [] + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) + eval_dataloader = DataLoader( + eval_dataset_for_model, + collate_fn=default_data_collator, + batch_size=self.model_args.eval_batch_size, + ) + self.model.eval() + all_start_logits = [] + all_end_logits = [] for batch in eval_dataloader: - self.model.eval() - batch = tuple(t.to(self.device) for t in batch) + for k in batch: + batch[k] = batch[k].to(self.device) with torch.no_grad(): - inputs = { - "input_ids": batch[0], - "attention_mask": batch[1], - "token_type_ids": batch[2], - } - feature_indices = batch[3] - outputs = self.model(**inputs) - - for i, feature_index in enumerate(feature_indices): - eval_feature = features[feature_index.item()] - unique_id = int(eval_feature.unique_id) - - output = [outputs[oname][i] for oname in outputs] - start_logits = outputs.start_logits[i] - end_logits = outputs.end_logits[i] - try: - start_logits = start_logits.item() - end_logits = end_logits.item() - except: - pass - - result = SquadResult(unique_id, start_logits, end_logits) - all_results.append(result) - - answers, nbest = 
compute_predictions_logits( - all_examples=examples, - all_features=features, - all_results=all_results, - n_best_size=self.args["n_best_size"], - max_answer_length=self.args["max_answer_length"], - do_lower_case=self.args["do_lower_case"], - output_prediction_file=self.args["output_prediction_file"], - output_nbest_file=self.args["output_nbest_file"], - output_null_log_odds_file=self.args["output_null_log_odds_file"], - verbose_logging=self.args["verbose_logging"], - version_2_with_negative=self.args["version_2_with_negative"], - null_score_diff_threshold=self.args["null_score_diff_threshold"], - tokenizer=self.tokenizer, - language=question.language - ) + outputs = self.model(**batch) + start_logits = outputs.start_logits + end_logits = outputs.end_logits + all_start_logits.append(start_logits.cpu().numpy()) + all_end_logits.append(end_logits.cpu().numpy()) - all_answers = [] + start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, self.args["max_answer_length"]) + end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, self.args["max_answer_length"]) - for idx, ans in enumerate(nbest): + del all_start_logits + del all_end_logits + + outputs_numpy = (start_logits_concat, end_logits_concat) + + all_nbest_json = post_processing_function(eval_examples, eval_dataset, outputs_numpy) + + all_answers = [] + for idx, ans in enumerate(all_nbest_json): all_answers.append(Answer( - text=nbest[ans][0]["text"], - score=nbest[ans][0]["start_logit"] + nbest[ans][0]["end_logit"], + text=all_nbest_json[ans][0]["text"], + score=all_nbest_json[ans][0]["probability"], + # score=all_nbest_json[ans][0]["start_logit"] + all_nbest_json[ans][0]["end_logit"], ctx_score=contexts[idx].score, language=question.language )) diff --git a/bertserini/reader/t5_reader.py b/bertserini/reader/t5_reader.py new file mode 100644 index 0000000..347181a --- /dev/null +++ b/bertserini/reader/t5_reader.py @@ -0,0 +1,208 @@ +from typing import List + +import torch 
+from torch.utils.data import DataLoader, SequentialSampler +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, EvalPrediction +import datasets +from datasets import Dataset +import numpy as np +from typing import List, Optional, Tuple + +from bertserini.reader.base import Reader, Question, Context, Answer + +from datasets.utils import logging + +__all__ = ['T5'] +class T5(Reader): + def __init__(self, args): + self.model_args = args + if self.model_args.tokenizer_name is None: + self.model_args.tokenizer_name = self.model_args.model_name_or_path + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_args.model_name_or_path).to(self.device).eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_args.tokenizer_name, do_lower_case=True) + self.question_column = 'question' + self.context_column = 'context' + self.answer_column = 'answers' + ''' + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 1 \ + --output_dir ./models/s2s_squad2_0train/ \ + --eval_accumulation_steps 1 \ + --predict_with_generate \ + ''' + self.args = { + "max_seq_length": 384, + "doc_stride": 128, + "max_query_length": 64, + "threads": 1, + "tqdm_enabled": False, + "n_best_size": 20, + "max_answer_length": 384, + "do_lower_case": True, + "output_prediction_file": False, + "output_nbest_file": self.model_args.output_nbest_file, + "output_null_log_odds_file": None, + "verbose_logging": False, + "version_2_with_negative": True, + "null_score_diff_threshold": 0, + "ignore_pad_token_for_loss": True + } + + def update_args(self, args_to_change): + for key in args_to_change: + self.args[key] = args_to_change[key] + + def predict(self, question: Question, contexts: List[Context]) -> List[Answer]: + logging.disable_progress_bar() + + def preprocess_squad_batch( + examples, + question_column: str, + context_column: str, + answer_column: str, + ) -> 
Tuple[List[str], List[str]]: + questions = examples[question_column] + contexts = examples[context_column] + answers = examples.get(answer_column,[]) + + def generate_input(_question, _context): + return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()]) + + inputs = [generate_input(question, context) for question, context in zip(questions, contexts)] + targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers] + return inputs, targets + + def preprocess_function(examples): + inputs, targets = preprocess_squad_batch(examples, self.question_column, self.context_column, self.answer_column) + + model_inputs = self.tokenizer(inputs, max_length=self.args["max_seq_length"], padding='max_length', truncation=True) + # Setup the tokenizer for targets + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer(targets, max_length=self.args['max_answer_length'], padding='max_length', truncation=True) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
+ if self.args['ignore_pad_token_for_loss']: + labels["input_ids"] = [ + [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + # Validation preprocessing + def preprocess_validation_function(examples): + inputs, targets = preprocess_squad_batch(examples, self.question_column, self.context_column, self.answer_column) + + model_inputs = self.tokenizer( + inputs, + max_length=self.args["max_seq_length"], + padding='max_length', + truncation=True, + return_offsets_mapping=True, + ) + + if targets: + # Setup the tokenizer for targets + with self.tokenizer.as_target_tokenizer(): + labels = self.tokenizer(targets, max_length=self.args['max_answer_length'], padding='max_length', truncation=True) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + # sample_mapping = model_inputs.pop("overflow_to_sample_mapping") + sample_mapping = list(range(len(model_inputs["input_ids"]))) + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + model_inputs["example_id"] = [] + + for i in range(len(model_inputs["input_ids"])): + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + model_inputs["example_id"].append(examples["id"][sample_index]) + + if targets: + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
+ if self.args['ignore_pad_token_for_loss']: + labels["input_ids"] = [ + [(l if l != self.tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + def post_processing_function(examples: datasets.Dataset, features: datasets.Dataset, outputs, stage="eval"): + # Decode the predicted tokens. + decoded_preds = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)} + predictions = {} + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # This is the index of the feature associated to the current example. + feature_index = feature_per_example[example_index] + predictions[example["id"]] = decoded_preds[feature_index] + + # Format the result to the format the metric expects. 
+ if self.args['version_2_with_negative']: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() + ] + else: + formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] + + # references = [{"id": ex["id"], "answers": ex[self.answer_column]} for ex in examples] + # return EvalPrediction(predictions=formatted_predictions, label_ids=references) + return formatted_predictions + + + + inputs = {"question": [], "context": [], "id": []} + for i, ctx in enumerate(contexts): + inputs["question"].append(question.text) + inputs["context"].append(contexts[i].text) + inputs["id"].append(i) + print(inputs) + eval_examples = Dataset.from_dict(inputs) + column_names = eval_examples.column_names + eval_dataset = eval_examples.map( + preprocess_validation_function, + batched=True, + num_proc=1, + remove_columns=column_names, + ) + + eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"]) + + eval_dataloader = DataLoader( + eval_dataset_for_model, + collate_fn=default_data_collator, + batch_size=self.model_args.eval_batch_size, + ) + raw_predict = [] + for batch in eval_dataloader: + for k in batch: + batch[k] = batch[k].to(self.device) + outs = self.model.generate(input_ids=batch['input_ids'], + attention_mask=batch['attention_mask'], + max_length=16, + early_stopping=True) + raw_predict.extend(outs) + all_nbest_json = post_processing_function(eval_examples, eval_dataset, raw_predict) + + all_answers = [] + for item in all_nbest_json: + all_answers.append(Answer( + text=item["prediction_text"], + score=0.0, + # score=all_nbest_json[ans][0]["start_logit"] + all_nbest_json[ans][0]["end_logit"], + ctx_score=contexts[item['id']].score, + language=question.language + )) + return all_answers + diff --git a/bertserini/retriever/pyserini_retriever.py b/bertserini/retriever/pyserini_retriever.py index fdd283e..ad17c27 100644 --- 
a/bertserini/retriever/pyserini_retriever.py +++ b/bertserini/retriever/pyserini_retriever.py @@ -86,5 +86,6 @@ def hits_to_contexts(hits: List[JLuceneSearcherResult], language="en", field='ra if s in t: continue metadata = {} - contexts.append(Context(t, language, metadata, score)) + + contexts.append(Context(hit, language=language, metadata=metadata, score=score)) return contexts diff --git a/bertserini/utils/utils.py b/bertserini/utils/utils.py index 5a3d2e4..ad4f2b6 100644 --- a/bertserini/utils/utils.py +++ b/bertserini/utils/utils.py @@ -5,7 +5,9 @@ import re import zhon import numpy as np +from hanziconv import HanziConv +from bertserini.reader.base import Question def strip_accents(text): return "".join(char for char in unicodedata.normalize('NFKD', text) @@ -174,3 +176,25 @@ def remove_punc(text): return ''.join(ch for ch in text if ch not in exclude) return remove_punc(s) + +def get_best_answer(candidates, weight=0.5): + for ans in candidates: + ans.aggregate_score(weight) + return sorted(candidates, key=lambda x: x.total_score, reverse=True)[0] + + +def extract_squad_questions(squad_filename, do_strip_accents=False, language="en"): + data = json.load(open(squad_filename, 'r')) + data = data["data"] + questions = [] + for article in data: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + id_ = qa["id"] + question = qa["question"] + if do_strip_accents: + question = strip_accents(question) + if language == "zh": + HanziConv.toSimplified(question) + questions.append(Question(question, id_, language)) + return questions diff --git a/bertserini/utils/utils_new.py b/bertserini/utils/utils_new.py deleted file mode 100644 index 09ee01f..0000000 --- a/bertserini/utils/utils_new.py +++ /dev/null @@ -1,28 +0,0 @@ -import json -from hanziconv import HanziConv - -from bertserini.reader.base import Question -from bertserini.utils.utils import strip_accents - - -def get_best_answer(candidates, weight=0.5): - for ans in candidates: - 
ans.aggregate_score(weight) - return sorted(candidates, key=lambda x: x.total_score, reverse=True)[0] - - -def extract_squad_questions(squad_filename, do_strip_accents=False, language="en"): - data = json.load(open(squad_filename, 'r')) - data = data["data"] - questions = [] - for article in data: - for paragraph in article["paragraphs"]: - for qa in paragraph["qas"]: - id_ = qa["id"] - question = qa["question"] - if do_strip_accents: - question = strip_accents(question) - if language == "zh": - HanziConv.toSimplified(question) - questions.append(Question(question, id_, language)) - return questions \ No newline at end of file diff --git a/bertserini/utils/utils_qa.py b/bertserini/utils/utils_qa.py new file mode 100644 index 0000000..dedcd52 --- /dev/null +++ b/bertserini/utils/utils_qa.py @@ -0,0 +1,434 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Post-processing utilities for question answering. 
+""" +import collections +import json +import logging +import os +from typing import Optional, Tuple + +import numpy as np +from tqdm.auto import tqdm + + +logger = logging.getLogger(__name__) + + +def postprocess_qa_predictions( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). + predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. 
+ null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. 
+ logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. 
+ if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. 
+        if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): +            predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + +        # NOTE: the softmax below is commented out in this version, so `probs` holds the raw +        # summed start/end logits — `pred["probability"]` is a score, not a normalized probability. +        probs = np.array([pred.pop("score") for pred in predictions]) +        # exp_scores = np.exp(scores - np.max(scores)) +        # probs = exp_scores / exp_scores.sum() + +        # Include the probabilities in our predictions. +        for prob, pred in zip(probs, predictions): +            pred["probability"] = prob + +        # Pick the best prediction. If the null answer is not possible, this is easy. +        if not version_2_with_negative: +            all_predictions[example["id"]] = predictions[0]["text"] +        else: +            # Otherwise we first need to find the best non-empty prediction. +            i = 0 +            while predictions[i]["text"] == "": +                i += 1 +            best_non_null_pred = predictions[i] + +            # Then we compare to the null prediction using the threshold. +            score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] +            scores_diff_json[example["id"]] = float(score_diff)  # To be JSON-serializable. +            if score_diff > null_score_diff_threshold: +                all_predictions[example["id"]] = "" +            else: +                all_predictions[example["id"]] = best_non_null_pred["text"] + +        # Make `predictions` JSON-serializable by casting np.float back to float. +        all_nbest_json[example["id"]] = [ +            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} +            for pred in predictions +        ] + +    # If we have an output_dir, let's save all those dicts.
+ if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, all_nbest_json + + +def postprocess_qa_predictions_with_beam_search( + examples, + features, + predictions: Tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + start_n_top: int = 5, + end_n_top: int = 5, + output_dir: Optional[str] = None, + prefix: Optional[str] = None, + log_level: Optional[int] = logging.WARNING, +): + """ + Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the + original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as + cls token predictions. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). 
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + start_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top start logits too keep when searching for the :obj:`n_best_size` predictions. + end_n_top (:obj:`int`, `optional`, defaults to 5): + The number of top end logits too keep when searching for the :obj:`n_best_size` predictions. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + if len(predictions) != 5: + raise ValueError("`predictions` should be a tuple with five elements.") + start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. 
+ example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() if version_2_with_negative else None + + # Logging. + logger.setLevel(log_level) + logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_log_prob = start_top_log_probs[feature_index] + start_indexes = start_top_index[feature_index] + end_log_prob = end_top_log_probs[feature_index] + end_indexes = end_top_index[feature_index] + feature_null_score = cls_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction + if min_null_score is None or feature_null_score < min_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits. 
+ for i in range(start_n_top): + for j in range(end_n_top): + start_index = int(start_indexes[i]) + j_index = i * end_n_top + j + end_index = int(end_indexes[j_index]) + # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the + # p_mask but let's not take any risk) + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length negative or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_log_prob[i] + end_log_prob[j_index], + "start_log_prob": start_log_prob[i], + "end_log_prob": end_log_prob[j_index], + } + ) + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0: + predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). 
+ scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction and set the probability for the null answer. + all_predictions[example["id"]] = predictions[0]["text"] + if version_2_with_negative: + scores_diff_json[example["id"]] = float(min_null_score) + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + logger.info(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + logger.info(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + logger.info(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions, scores_diff_json diff --git a/bertserini/utils/utils_squad_metrics.py b/bertserini/utils/utils_squad_metrics.py index 
e3b5716..284af9e 100644 --- a/bertserini/utils/utils_squad_metrics.py +++ b/bertserini/utils/utils_squad_metrics.py @@ -251,7 +251,7 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ return evaluation -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False, language="en", tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): +def get_final_text(pred_text, orig_text, do_lower_case, tokenizer, verbose_logging=False, language="en", tokenizer_name="rsvp-ai/bertserini-bert-base-squad"): """Project the tokenized prediction back to the original text.""" # When we created the data, we kept track of the alignment between original @@ -295,7 +295,7 @@ def _strip_spaces(text): # NOT the same length, the heuristic has failed. If they are the same # length, we assume the characters are one-to-one aligned. - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=False) + #tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True) if language=="zh": tok_text = "".join(tokenizer.tokenize(orig_text)) else: @@ -422,6 +422,8 @@ def compute_predictions_logits( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() + import os + os.environ["TOKENIZERS_PARALLELISM"] = "false" for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] @@ -491,7 +493,9 @@ def compute_predictions_logits( seen_predictions = {} nbest = [] + c = 0 for pred in prelim_predictions: + c += 1 if len(nbest) >= n_best_size: break feature = features[pred.feature_index] @@ -518,7 +522,7 @@ def compute_predictions_logits( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, language=language) + final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, verbose_logging=verbose_logging, 
language=language) if "##" in final_text or "[UNK]" in final_text: print(final_text, "||", tok_text, "||", orig_text) @@ -736,7 +740,7 @@ compute_predictions_log_probs( else: do_lower_case = tokenizer.do_lowercase_and_remove_accent - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, tokenizer, verbose_logging=verbose_logging) if final_text in seen_predictions: continue diff --git a/inference_test.py b/inference_test.py index 69bf0f7..edc06a7 100644 --- a/inference_test.py +++ b/inference_test.py @@ -1,15 +1,16 @@ from bertserini.reader.base import Question, Context from bertserini.reader.bert_reader import BERT from bertserini.reader.dpr_reader import DPR -from bertserini.utils.utils_new import get_best_answer +from bertserini.reader.t5_reader import T5 +from bertserini.utils.utils import get_best_answer from bertserini.experiments.args import * from bertserini.retriever.pyserini_retriever import retriever, build_searcher -ENG_reader = "BERT" +ENG_reader = "T5" do_local_test = True do_bm25_test = True -do_dpr_test = True -do_chinese_test = True +do_dpr_test = False +do_chinese_test = False if ENG_reader == "BERT": args.model_name_or_path = "rsvp-ai/bertserini-bert-base-squad" @@ -21,19 +22,23 @@ args.tokenizer_name = "facebook/dpr-reader-multiset-base" bert_reader = DPR(args) -question = Question("Why did Mark Twain call the 19th century the glied age?") +elif ENG_reader == "T5": + args.model_name_or_path = "/data/aileen/workspace/t5_test2/models/gpu/checkpoint-10500" + args.tokenizer_name = "t5-base" + bert_reader = T5(args) + +# question = Question("Why did Mark Twain call the 19th century the glied age?") +question = Question("What is the capital city of China?") + print(question.text) if do_local_test: print("######################### Testing Local Context #########################") - contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period
of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.')] + contexts = [Context('The "Gilded Age" was a term that Mark Twain used to describe the period of the late 19th century when there had been a dramatic expansion of American wealth and prosperity.'), + Context('The "Gilded Age"')] candidates = bert_reader.predict(question, contexts) answer = get_best_answer(candidates, 1.0) print("Answer:", answer.text) - if answer.text == "there had been a dramatic expansion of American wealth and prosperity": - print("Local Context Test Passed") - else: - print("Wrong Answer") if do_bm25_test: print("######################### Testing BM25 Context #########################") @@ -41,9 +46,10 @@ searcher = build_searcher(args) contexts = retriever(question, searcher, 10) candidates = bert_reader.predict(question, contexts) + print(candidates) answer = get_best_answer(candidates, 0.45) print("Answer:", answer.text) # todo: no context returned. is the context included? maybe update to another question - print("BM25 Test Passed") + # print("BM25 Test Passed") if do_dpr_test: print("######################### Testing DPR Context #########################") @@ -51,7 +57,7 @@ args.encoder = "facebook/dpr-question_encoder-multiset-base" args.query_tokenizer_name = "facebook/dpr-question_encoder-multiset-base" args.index_path = "../pyserini/dpr-ctx_encoder-multiset-base.ik-nlp-22_slp" # todo: replicate dpr on wiki and release dpr-indexes - args.device = "cuda:0" + args.device = "cpu" args.sparse_index = "../anserini/lucene-index.ik-nlp-22" searcher = build_searcher(args) contexts = retriever(question, searcher, 10)