diff --git a/.github/code_spell_ignore.txt b/.github/code_spell_ignore.txt index e69de29b..c7e0b759 100644 --- a/.github/code_spell_ignore.txt +++ b/.github/code_spell_ignore.txt @@ -0,0 +1,3 @@ +rouge +Rouge +ROUGE \ No newline at end of file diff --git a/evals/evaluation/rag_eval/README.md b/evals/evaluation/rag_eval/README.md new file mode 100644 index 00000000..1fa85138 --- /dev/null +++ b/evals/evaluation/rag_eval/README.md @@ -0,0 +1,32 @@ +# CRUD-RAG +[CRUD-RAG](https://arxiv.org/abs/2401.17043) is a Chinese benchmark for RAG (Retrieval-Augmented Generation) systems. This example utilizes CRUD-RAG for evaluating the RAG system. + +## Prerequisite + +### Environment +```bash +pip install -r requirements.txt +``` + +### Prepare Dataset +We use the evaluation dataset from the [CRUD-RAG](https://github.com/IAAR-Shanghai/CRUD_RAG) repo; use the below command to prepare the dataset. +```bash +git clone https://github.com/IAAR-Shanghai/CRUD_RAG +mkdir data/ +cp CRUD_RAG/data/crud_split/split_merged.json data/ +cp -r CRUD_RAG/data/80000_docs/ data/ +python examples/process_crud_dataset.py +``` + +### Launch Service of RAG System +Please refer to this [guide](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/README.md) to launch the service of the RAG system. + +## Evaluation +Use the below command to run the evaluation. Please note that for the first run, the argument `--ingest_docs` should be added in the command to ingest the documents into the vector database, while for subsequent runs, this argument should be omitted. +```bash +cd examples +python main.py --dataset_path ../data/split_merged.json --docs_path ../data/80000_docs --ingest_docs +``` + +## Acknowledgements +This example is mostly adapted from the [CRUD-RAG](https://github.com/IAAR-Shanghai/CRUD_RAG) repo; we thank the authors for their great work! 
diff --git a/evals/evaluation/rag_eval/__init__.py b/evals/evaluation/rag_eval/__init__.py new file mode 100644 index 00000000..6c57244e --- /dev/null +++ b/evals/evaluation/rag_eval/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# + +from .evaluator import Evaluator + +__all__ = ["Evaluator"] diff --git a/evals/evaluation/rag_eval/evaluator.py b/evals/evaluation/rag_eval/evaluator.py new file mode 100644 index 00000000..6dfe6677 --- /dev/null +++ b/evals/evaluation/rag_eval/evaluator.py @@ -0,0 +1,215 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import datetime +import json +import os + +import requests +from tqdm import tqdm + +from evals.metrics import bleu_score, rougeL_score + +from .metrics import LLM_score + + +class Evaluator: + def __init__( + self, + dataset: list[dict], + output_path: str, + task: str, + ) -> None: + """Args: + dataset (list[dict]): The dataset for evaluation. + output_path (str): The path to save results. + task (str): Task to evaluate. + """ + self.task = task + self.output_path = output_path + self.dataset = dataset + + @staticmethod + def ingest_docs(documents_path: str, database_endpoint: str): + """Args: + documents_path (str): The path to documents. + database_endpoint (str): URL of database. 
+ """ + files = [] + if os.path.isfile(documents_path): + files.append(documents_path) + elif os.path.isdir(documents_path): + for root, dirs, files_ in os.walk(documents_path): + files += [os.path.join(root, f) for f in files_] + for file in tqdm(files): + file_obj = open(file, mode="rb") + response = requests.post(database_endpoint, files={"files": file_obj}) + if response.ok: + print(f"Successfully ingested {file}.") + else: + print(f"Failed to ingest {file}.") + file_obj.close() + + def get_ground_truth_text(self, data: dict): + raise NotImplementedError("Depends on the specific dataset.") + + def get_query(self, data: dict): + raise NotImplementedError("Depends on the specific dataset.") + + def get_document(self, data: dict): + raise NotImplementedError("Depends on the specific dataset.") + + def scoring(self, data: dict, llm_endpoint: str = None) -> dict: + generated_text = data["generated_text"] + ground_truth_text = self.get_ground_truth_text(data) + data["ground_truth_text"] = ground_truth_text + + bleu_avg, bleu1, bleu2, bleu3, bleu4 = bleu_score(generated_text, ground_truth_text) + + return { + "metrics": { + "bleu-avg": bleu_avg or 0.0, + "bleu-1": bleu1 or 0.0, + "bleu-2": bleu2 or 0.0, + "bleu-3": bleu3 or 0.0, + "bleu-4": bleu4 or 0.0, + "rouge-L": rougeL_score(generated_text, ground_truth_text) or 0.0, + "LLM-score": LLM_score(generated_text, ground_truth_text, llm_endpoint) or 0.0, + "length": len(generated_text), + }, + "log": { + "generated_text": generated_text, + "ground_truth_text": ground_truth_text, + "evaluateDatetime": str(datetime.datetime.now()), + }, + "valid": len(generated_text.strip()) != 0, + } + + def compute_overall(self, results: list[dict]) -> dict: + overall = { + "bleu-avg": 0, + "bleu-1": 0, + "bleu-2": 0, + "bleu-3": 0, + "bleu-4": 0, + "rouge-L": 0, + "LLM-score": 0.0, + "length": 0, + } + + for result in results: + overall = {key: overall[key] + result["metrics"][key] for key in overall.keys()} + + overall_save = {f"avg. 
{key}": value / len(results) for key, value in overall.items()} + + overall_save["num"] = len(results) + + return overall_save + + def save_output(self, output: dict) -> None: + """Save evaluation results.""" + with open(self.output_path, "w", encoding="utf-8") as f: + json.dump(output, f, ensure_ascii=False, indent=4) + + def read_output(self) -> dict: + with open(self.output_path) as f: + return json.load(f) + + def remove_invalid(self, results: list[dict]) -> list[dict]: + """Remove invalid results from the list and return the cleaned results.""" + return [result for result in results if result["valid"]] + + def send_request(self, data, arguments): + service_url = arguments.service_url + headers = {"Content-Type": "application/json"} + json_data = {} + query = self.get_query(data) + json_data["messages"] = query + json_data["stream"] = False + json_data["temperature"] = arguments.temperature + json_data["max_new_tokens"] = arguments.max_new_tokens + json_data = json.dumps(json_data) + response = requests.post(service_url, data=json_data, headers=headers) + if response.ok: + return response.json()["choices"][0]["message"]["content"] + else: + print(f"Request for pipeline failed due to {response.text}.") + return "" + + def get_retrieved_documents(self, data, arguments): + query = self.get_query(data) + data = {"text": query} + headers = {"Content-Type": "application/json"} + response = requests.post(arguments.embedding_endpoint, data=json.dumps(data), headers=headers) + if response.ok: + embedding = response.json()["embedding"] + else: + print(f"Request for embedding failed due to {response.text}.") + return [] + data = { + "text": query, + "embedding": embedding, + "search_type": "similarity", + "k": 4, + "fetch_k": 20, + "lambda_mult": 0.5, + } + response = requests.post(arguments.retrieval_endpoint, data=json.dumps(data), headers=headers) + if response.ok: + retrieved_documents = response.json()["retrieved_docs"] + return [doc["text"] for doc in
retrieved_documents] + else: + print(f"Request for retrieval failed due to {response.text}.") + return [] + + def scoring_retrieval(self, data, retrieved_documents): + ground_truth_documents = self.get_document(data) + + def evaluate(self, arguments, sort=True, show_progress_bar=False, contain_original_data=False): + """Run a complete evaluation. + + Args: + arguments: Arguments. + sort (bool): Whether to sort the results by id. + show_progress_bar (bool): Whether to display a progress bar. + contain_original_data (bool): Whether to include original data in the results for debugging. + + Returns: + dict: Output dictionary contains fields such as: overall, results, etc. + """ + if os.path.exists(self.output_path): # Resume evaluation + results = self.read_output().get("results", []) + results = self.remove_invalid(results) + saved_ids = [result["id"] for result in results] + else: + results = [] + saved_ids = [] + + for data in tqdm(self.dataset) if show_progress_bar else self.dataset: + if data["ID"] in saved_ids: + continue # Skip results that have already been evaluated and are valid + try: + retrieved_documents = self.get_retrieved_documents(data, arguments) + data["retrieved_documents"] = retrieved_documents + generated_text = self.send_request(data, arguments) + data["generated_text"] = generated_text + result = {"id": data["ID"], **self.scoring(data, arguments.llm_endpoint)} + if contain_original_data: + result["original_data"] = data + results.append(result) + except Exception as e: + print(repr(e)) + + results = sorted(results, key=lambda x: x["id"]) if sort else results + valid_results = self.remove_invalid(results) + + try: + overall = self.compute_overall(valid_results) if len(valid_results) > 0 else {} + except Exception as e: + print(repr(e)) + overall = dict() + + output = {"overall": overall, "results": results} + self.save_output(output) + print(f"Output saved to {self.output_path}!") + return output diff --git 
a/evals/evaluation/rag_eval/examples/main.py b/evals/evaluation/rag_eval/examples/main.py new file mode 100644 index 00000000..8b8e5bff --- /dev/null +++ b/evals/evaluation/rag_eval/examples/main.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import argparse +import json +import os + +from evals.evaluation.rag_eval import Evaluator + + +class CRUD_Evaluator(Evaluator): + def get_ground_truth_text(self, data: dict): + if self.task == "summarization": + ground_truth_text = data["summary"] + elif self.task == "question_answering": + ground_truth_text = data["answers"] + elif self.task == "continuation": + ground_truth_text = data["continuing"] + elif self.task == "hallucinated_modified": + ground_truth_text = data["hallucinatedMod"] + else: + raise NotImplementedError( + f"Unknown task {self.task}, only support " + "summarization, question_answering, continuation and hallucinated_modified." + ) + return ground_truth_text + + def get_query(self, data: dict): + if self.task == "summarization": + query = data["text"] + elif self.task == "question_answering": + query = data["questions"] + elif self.task == "continuation": + query = data["beginning"] + elif self.task == "hallucinated_modified": + query = data["newsBeginning"] + else: + raise NotImplementedError( + f"Unknown task {self.task}, only support " + "summarization, question_answering, continuation and hallucinated_modified." 
+ ) + return query + + def get_document(self, data: dict): + if self.task == "summarization": + document = data["text"] + elif self.task == "question_answering": + document = data["news1"] + elif self.task == "continuation": + document = data["beginning"] + elif self.task == "hallucinated_modified": + document = data["newsBeginning"] + else: + raise NotImplementedError( + f"Unknown task {self.task}, only support " + "summarization, question_answering, continuation and hallucinated_modified." + ) + return document + + +def args_parser(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--service_url", type=str, default="http://localhost:8888/v1/chatqna", help="Service URL address." + ) + parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save evaluation results.") + parser.add_argument( + "--temperature", type=float, default=0.1, help="Controls the randomness of the model's text generation" + ) + parser.add_argument( + "--max_new_tokens", type=int, default=1280, help="Maximum number of new tokens to be generated by the model" + ) + parser.add_argument("--dataset_path", default="../data/split_merged.json", help="Path to the dataset") + parser.add_argument("--docs_path", default="../data/80000_docs", help="Path to the retrieval documents") + + # Retriever related options + parser.add_argument("--tasks", default=["question_answering"], nargs="+", help="Task to perform") + parser.add_argument("--ingest_docs", action="store_true", help="Whether to ingest documents to vector database") + parser.add_argument( + "--database_endpoint", type=str, default="http://localhost:6007/v1/dataprep", help="Service URL address." + ) + parser.add_argument( + "--embedding_endpoint", type=str, default="http://localhost:6000/v1/embeddings", help="Service URL address." + ) + parser.add_argument( + "--retrieval_endpoint", type=str, default="http://localhost:7000/v1/retrieval", help="Service URL address." 
+ ) + parser.add_argument( + "--llm_endpoint", type=str, default="http://localhost:9009/generate", help="Service URL address." + ) + parser.add_argument( + "--show_progress_bar", action="store", default=True, type=bool, help="Whether to show a progress bar" + ) + parser.add_argument("--contain_original_data", action="store_true", help="Whether to contain original data") + + args = parser.parse_args() + return args + + +def main(): + args = args_parser() + if os.path.isfile(args.dataset_path): + with open(args.dataset_path) as f: + all_datasets = json.load(f) + else: + raise FileNotFoundError(f"Evaluation dataset file {args.dataset_path} not exist.") + os.makedirs(args.output_dir, exist_ok=True) + for task in args.tasks: + if task == "question_answering": + dataset = all_datasets["questanswer_1doc"] + elif task == "summarization": + dataset = all_datasets["event_summary"] + else: + raise NotImplementedError( + f"Unknown task {task}, only support " + "summarization, question_answering, continuation and hallucinated_modified." 
+ ) + output_save_path = os.path.join(args.output_dir, f"{task}.json") + evaluator = CRUD_Evaluator(dataset, output_save_path, task) + if args.ingest_docs: + CRUD_Evaluator.ingest_docs(args.docs_path, args.database_endpoint) + results = evaluator.evaluate( + args, show_progress_bar=args.show_progress_bar, contain_original_data=args.contain_original_data + ) + print(f"Evaluation results of task {task} saved to {output_save_path}.") + + +if __name__ == "__main__": + main() diff --git a/evals/evaluation/rag_eval/examples/process_crud_dataset.py b/evals/evaluation/rag_eval/examples/process_crud_dataset.py new file mode 100644 index 00000000..bb1ca3e4 --- /dev/null +++ b/evals/evaluation/rag_eval/examples/process_crud_dataset.py @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +path = os.path.join(os.path.dirname(__file__), "../data/80000_docs") +for file in os.listdir(path): + src_file = os.path.join(path, file) + os.rename(src_file, src_file + ".txt") diff --git a/evals/evaluation/rag_eval/metrics/__init__.py b/evals/evaluation/rag_eval/metrics/__init__.py new file mode 100644 index 00000000..78b1932f --- /dev/null +++ b/evals/evaluation/rag_eval/metrics/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from .metrics import LLM_score diff --git a/evals/evaluation/rag_eval/metrics/metrics.py b/evals/evaluation/rag_eval/metrics/metrics.py new file mode 100644 index 00000000..9d52d0fd --- /dev/null +++ b/evals/evaluation/rag_eval/metrics/metrics.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +import json + +import requests + +from .template import CorrelationTemplate + + +def LLM_score(continuation: str, reference: str, llm_endpoint: str) -> float: + if llm_endpoint: + query = 
CorrelationTemplate.generate_query(continuation, reference) + req = {"inputs": query, "parameters": {"max_new_tokens": 5, "do_sample": False}} + try: + response = requests.post(llm_endpoint, headers={"Content-Type": "application/json"}, data=json.dumps(req)) + response.raise_for_status() + response = response.json() + score = int(response["generated_text"].strip()) + return score + except Exception as e: + print(str(e)) + return 0.0 diff --git a/evals/evaluation/rag_eval/metrics/template.py b/evals/evaluation/rag_eval/metrics/template.py new file mode 100644 index 00000000..f30d4311 --- /dev/null +++ b/evals/evaluation/rag_eval/metrics/template.py @@ -0,0 +1,22 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +class CorrelationTemplate: + @staticmethod + def generate_query(continuation: str, reference: str): + return f"""请你评估以下两个句子的相关性,并给出相关性评分,评分从最低的1到最高的5。 + +请按以下评估步骤进行评估: +1. 仔细阅读给定的两个句子。 +2. 比较两个句子的相关性。 +3. 给出从1到5的相关性评分。 + +以下是句子1: +{reference} + +以下是句子2: +{continuation} + +请按要求给出你的评分: +""" diff --git a/evals/metrics/__init__.py b/evals/metrics/__init__.py index 2d5825c6..e08a2586 100644 --- a/evals/metrics/__init__.py +++ b/evals/metrics/__init__.py @@ -2,3 +2,5 @@ # -*- coding: utf-8 -*- # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 + +from .utils import bleu_score, rougeL_score diff --git a/evals/metrics/bleu/README.md b/evals/metrics/bleu/README.md new file mode 100644 index 00000000..d92598f6 --- /dev/null +++ b/evals/metrics/bleu/README.md @@ -0,0 +1,167 @@ +--- +title: BLEU +emoji: 🤗 +colorFrom: blue +colorTo: red +sdk: gradio +sdk_version: 3.19.1 +app_file: app.py +pinned: false +tags: +- evaluate +- metric +description: >- + BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another. 
+ Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is" + – this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics. + + Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations. + Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality. + Neither intelligibility nor grammatical correctness are not taken into account. +--- + +# Metric Card for BLEU + + +## Metric Description +BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another. Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is" – this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics. + +Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations. Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality. Neither intelligibility nor grammatical correctness are not taken into account. + +## Intended Uses +BLEU and BLEU-derived metrics are most often used for machine translation. 
+ +## How to Use + +This metric takes as input a list of predicted sentences and a list of lists of reference sentences (since each predicted sentence can have multiple references): + +```python +>>> predictions = ["hello there general kenobi", "foo bar foobar"] +>>> references = [ +... ["hello there general kenobi", "hello there !"], +... ["foo bar foobar"] +... ] +>>> bleu = evaluate.load("bleu") +>>> results = bleu.compute(predictions=predictions, references=references) +>>> print(results) +{'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 7, 'reference_length': 6} +``` + +### Inputs +- **predictions** (`list` of `str`s): Translations to score. +- **references** (`list` of `list`s of `str`s): references for each translation. +- ** tokenizer** : approach used for standardizing `predictions` and `references`. + The default tokenizer is `tokenizer_13a`, a relatively minimal tokenization approach that is however equivalent to `mteval-v13a`, used by WMT. + This can be replaced by another tokenizer from a source such as [SacreBLEU](https://github.com/mjpost/sacrebleu/tree/master/sacrebleu/tokenizers). + +The default tokenizer is based on whitespace and regexes. It can be replaced by any function that takes a string as input and returns a list of tokens as output. E.g. `word_tokenize()` from [NLTK](https://www.nltk.org/api/nltk.tokenize.html) or pretrained tokenizers from the [Tokenizers library](https://huggingface.co/docs/tokenizers/index). +- **max_order** (`int`): Maximum n-gram order to use when computing BLEU score. Defaults to `4`. +- **smooth** (`boolean`): Whether or not to apply Lin et al. 2004 smoothing. Defaults to `False`. 
+ +### Output Values +- **bleu** (`float`): bleu score +- **precisions** (`list` of `float`s): geometric mean of n-gram precisions, +- **brevity_penalty** (`float`): brevity penalty, +- **length_ratio** (`float`): ratio of lengths, +- **translation_length** (`int`): translation_length, +- **reference_length** (`int`): reference_length + +Output Example: +```python +{ + "bleu": 1.0, + "precisions": [1.0, 1.0, 1.0, 1.0], + "brevity_penalty": 1.0, + "length_ratio": 1.1666666666666667, + "translation_length": 7, + "reference_length": 6, +} +``` + +BLEU's output is always a number between 0 and 1. This value indicates how similar the candidate text is to the reference texts, with values closer to 1 representing more similar texts. Few human translations will attain a score of 1, since this would indicate that the candidate is identical to one of the reference translations. For this reason, it is not necessary to attain a score of 1. Because there are more opportunities to match, adding additional reference translations will increase the BLEU score. + +#### Values from Popular Papers +The [original BLEU paper](https://aclanthology.org/P02-1040/) (Papineni et al. 2002) compares BLEU scores of five different models on the same 500-sentence corpus. These scores ranged from 0.0527 to 0.2571. + +The [Attention is All you Need paper](https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) (Vaswani et al. 2017) got a BLEU score of 0.284 on the WMT 2014 English-to-German translation task, and 0.41 on the WMT 2014 English-to-French translation task. + +### Examples + +Example where each prediction has 1 reference: +```python +>>> predictions = ["hello there general kenobi","foo bar foobar"] +>>> references = [ +... ["hello there general kenobi"], +... ["foo bar foobar"] +... 
] +>>> bleu = evaluate.load("bleu") +>>> results = bleu.compute(predictions=predictions, references=references) +>>> print(results) +{'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 7, 'reference_length': 7} +``` + +Example where the second prediction has 2 references: +```python +>>> predictions = [ +... ["hello there general kenobi", +... ["foo bar foobar"] +... ] +>>> references = [ +... [["hello there general kenobi"], ["hello there!"]], +... [["foo bar foobar"]] +... ] +>>> bleu = evaluate.load("bleu") +>>> results = bleu.compute(predictions=predictions, references=references) +>>> print(results) +{'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 7, 'reference_length': 6} +``` + +Example with the word tokenizer from NLTK: +```python +>>> bleu = evaluate.load("bleu") +>>> from nltk.tokenize import word_tokenize +>>> predictions = [ +... ["hello there general kenobi", +... ["foo bar foobar"] +... ] +>>> references = [ +... [["hello there general kenobi"], ["hello there!"]], +... [["foo bar foobar"]] +... ] +>>> results = bleu.compute(predictions=predictions, references=references, tokenizer=word_tokenize) +>>> print(results) +{'bleu': 1.0, 'precisions': [1.0, 1.0, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1666666666666667, 'translation_length': 7, 'reference_length': 6} +``` + +## Limitations and Bias +This metric has multiple known limitations: +- BLEU compares overlap in tokens from the predictions and references, instead of comparing meaning. This can lead to discrepancies between BLEU scores and human ratings. +- Shorter predicted translations achieve higher scores than longer ones, simply due to how the score is calculated. A brevity penalty is introduced to attempt to counteract this. +- BLEU scores are not comparable across different datasets, nor are they comparable across different languages. 
+- BLEU scores can vary greatly depending on which parameters are used to generate the scores, especially when different tokenization and normalization techniques are used. It is therefore not possible to compare BLEU scores generated using different parameters, or when these parameters are unknown. For more discussion around this topic, see the following [issue](https://github.com/huggingface/datasets/issues/137). + +## Citation +```bibtex +@INPROCEEDINGS{Papineni02bleu:a, + author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu}, + title = {BLEU: a Method for Automatic Evaluation of Machine Translation}, + booktitle = {}, + year = {2002}, + pages = {311--318} +} +@inproceedings{lin-och-2004-orange, + title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation", + author = "Lin, Chin-Yew and + Och, Franz Josef", + booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics", + month = "aug 23{--}aug 27", + year = "2004", + address = "Geneva, Switzerland", + publisher = "COLING", + url = "https://www.aclweb.org/anthology/C04-1072", + pages = "501--507", +} +``` + +## Further References +- This Hugging Face implementation uses [this Tensorflow implementation](https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py) diff --git a/evals/metrics/bleu/bleu.py b/evals/metrics/bleu/bleu.py new file mode 100644 index 00000000..8c45eb16 --- /dev/null +++ b/evals/metrics/bleu/bleu.py @@ -0,0 +1,131 @@ +# Copyright 2020 The HuggingFace Evaluate Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BLEU metric.""" + +import datasets +import evaluate + +from .nmt_bleu import compute_bleu +from .tokenizer_13a import Tokenizer13a + +_CITATION = """\ +@INPROCEEDINGS{Papineni02bleu:a, + author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu}, + title = {BLEU: a Method for Automatic Evaluation of Machine Translation}, + booktitle = {}, + year = {2002}, + pages = {311--318} +} +@inproceedings{lin-och-2004-orange, + title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation", + author = "Lin, Chin-Yew and + Och, Franz Josef", + booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics", + month = "aug 23{--}aug 27", + year = "2004", + address = "Geneva, Switzerland", + publisher = "COLING", + url = "https://www.aclweb.org/anthology/C04-1072", + pages = "501--507", +} +""" + +_DESCRIPTION = """\ +BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another. +Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is" +– this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics. 
+ +Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations. +Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality. +Neither intelligibility nor grammatical correctness are not taken into account. +""" + +_KWARGS_DESCRIPTION = """ +Computes BLEU score of translated segments against one or more references. +Args: + predictions: list of translations to score. + references: list of lists of or just a list of references for each translation. + tokenizer : approach used for tokenizing `predictions` and `references`. + The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT. + This can be replaced by any function that takes a string as input and returns a list of tokens as output. + max_order: Maximum n-gram order to use when computing BLEU score. + smooth: Whether or not to apply Lin et al. 2004 smoothing. +Returns: + 'bleu': bleu score, + 'precisions': geometric mean of n-gram precisions, + 'brevity_penalty': brevity penalty, + 'length_ratio': ratio of lengths, + 'translation_length': translation_length, + 'reference_length': reference_length +Examples: + + >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> references = [ + ... ["hello there general kenobi", "hello there!"], + ... ["foo bar foobar"] + ... 
] + >>> bleu = evaluate.load("bleu") + >>> results = bleu.compute(predictions=predictions, references=references) + >>> print(results["bleu"]) + 1.0 +""" + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Bleu(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=[ + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), + ], + codebase_urls=["https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py"], + reference_urls=[ + "https://en.wikipedia.org/wiki/BLEU", + "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213", + ], + ) + + def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False): + # if only one reference is provided make sure we still use list of lists + if isinstance(references[0], str): + references = [[ref] for ref in references] + + references = [[tokenizer(r) for r in ref] for ref in references] + predictions = [tokenizer(p) for p in predictions] + score = compute_bleu( + reference_corpus=references, translation_corpus=predictions, max_order=max_order, smooth=smooth + ) + (bleu, precisions, bp, ratio, translation_length, reference_length) = score + return { + "bleu": bleu, + "precisions": precisions, + "brevity_penalty": bp, + "length_ratio": ratio, + "translation_length": translation_length, + "reference_length": reference_length, + } diff --git a/evals/metrics/bleu/nmt_bleu.py b/evals/metrics/bleu/nmt_bleu.py new file mode 100644 index 00000000..5d1b0283 --- /dev/null +++ b/evals/metrics/bleu/nmt_bleu.py @@ -0,0 
+1,107 @@ +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Python implementation of BLEU and smooth-BLEU. + +This module provides a Python implementation of BLEU and smooth-BLEU. +Smooth BLEU is computed following the method outlined in the paper: +Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic +evaluation metrics for machine translation. COLING 2004. +""" + +import collections +import math + + +def _get_ngrams(segment, max_order): + """Extracts all n-grams upto a given maximum order from an input segment. + + Args: + segment: text segment from which n-grams will be extracted. + max_order: maximum length in tokens of the n-grams returned by this + methods. + + Returns: + The Counter containing all n-grams upto max_order in segment + with a count of how many times each n-gram occurred. + """ + ngram_counts = collections.Counter() + for order in range(1, max_order + 1): + for i in range(0, len(segment) - order + 1): + ngram = tuple(segment[i : i + order]) + ngram_counts[ngram] += 1 + return ngram_counts + + +def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=False): + """Computes BLEU score of translated segments against one or more references. + + Args: + reference_corpus: list of lists of references for each translation. Each + reference should be tokenized into a list of tokens. 
+ translation_corpus: list of translations to score. Each translation + should be tokenized into a list of tokens. + max_order: Maximum n-gram order to use when computing BLEU score. + smooth: Whether or not to apply Lin et al. 2004 smoothing. + + Returns: + 3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram + precisions and brevity penalty. + """ + matches_by_order = [0] * max_order + possible_matches_by_order = [0] * max_order + reference_length = 0 + translation_length = 0 + for references, translation in zip(reference_corpus, translation_corpus): + reference_length += min(len(r) for r in references) + translation_length += len(translation) + + merged_ref_ngram_counts = collections.Counter() + for reference in references: + merged_ref_ngram_counts |= _get_ngrams(reference, max_order) + translation_ngram_counts = _get_ngrams(translation, max_order) + overlap = translation_ngram_counts & merged_ref_ngram_counts + for ngram in overlap: + matches_by_order[len(ngram) - 1] += overlap[ngram] + for order in range(1, max_order + 1): + possible_matches = len(translation) - order + 1 + if possible_matches > 0: + possible_matches_by_order[order - 1] += possible_matches + + precisions = [0] * max_order + for i in range(0, max_order): + if smooth: + precisions[i] = (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0) + else: + if possible_matches_by_order[i] > 0: + precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i] + else: + precisions[i] = 0.0 + + if min(precisions) > 0: + p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions) + geo_mean = math.exp(p_log_sum) + else: + geo_mean = 0 + + ratio = float(translation_length) / reference_length + + if ratio > 1.0: + bp = 1.0 + else: + bp = math.exp(1 - 1.0 / ratio) + + bleu = geo_mean * bp + + return (bleu, precisions, bp, ratio, translation_length, reference_length) diff --git a/evals/metrics/bleu/tokenizer_13a.py b/evals/metrics/bleu/tokenizer_13a.py new 
# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from functools import lru_cache


class BaseTokenizer:
    """A base dummy tokenizer to derive from."""

    def signature(self):
        """Returns a signature for the tokenizer.

        :return: signature string
        """
        return "none"

    def __call__(self, line):
        """Tokenizes an input line with the tokenizer.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        return line


class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
        self._re = [
            # language-dependent part (assuming Western languages)
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # one space only between words
            # NOTE: Doing this in Python (below) is faster
            # (re.compile(r'\s+'), r' '),
        ]

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.

        :param line: a segment to tokenize
        :return: the tokenized line
        """
        for _re, repl in self._re:
            line = _re.sub(repl, line)

        # no leading or trailing spaces, single space within words
        # return ' '.join(line.split())
        # This line is changed with regards to the original tokenizer (seen above) to return individual words
        return line.split()


class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: the tokenized line
        """

        # language-independent part:
        # NOTE(fix): restored from upstream sacrebleu — the "<skipped>" tag and
        # the XML entity names below had been stripped/garbled by HTML
        # escaping (one replace() call was even left with an unterminated
        # string literal), turning the unescaping into no-ops.
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")

        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")

        return self._post_tokenizer(f" {line} ")
+"""ROUGE metric from Google Research github repo.""" + +# The dependencies in https://github.com/google-research/google-research/blob/master/rouge/requirements.txt +import absl # Here to have a nice missing dependency error message early on +import datasets +import evaluate +import nltk # Here to have a nice missing dependency error message early on +import numpy # Here to have a nice missing dependency error message early on +import six # Here to have a nice missing dependency error message early on +from rouge_score import rouge_scorer, scoring + +_CITATION = """\ +@inproceedings{lin-2004-rouge, + title = "{ROUGE}: A Package for Automatic Evaluation of Summaries", + author = "Lin, Chin-Yew", + booktitle = "Text Summarization Branches Out", + month = jul, + year = "2004", + address = "Barcelona, Spain", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W04-1013", + pages = "74--81", +} +""" + +_DESCRIPTION = """\ +ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for +evaluating automatic summarization and machine translation software in natural language processing. +The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation. + +Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters. + +This metrics is a wrapper around Google Research reimplementation of ROUGE: +https://github.com/google-research/google-research/tree/master/rouge +""" + +_KWARGS_DESCRIPTION = """ +Calculates average rouge scores for a list of hypotheses and references +Args: + predictions: list of predictions to score. Each prediction + should be a string with tokens separated by spaces. + references: list of reference for each prediction. 
Each + reference should be a string with tokens separated by spaces. + rouge_types: A list of rouge types to calculate. + Valid names: + `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring, + `"rougeL"`: Longest common subsequence based scoring. + `"rougeLsum"`: rougeLsum splits text using `"\n"`. + See details in https://github.com/huggingface/datasets/issues/617 + use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes. + use_aggregator: Return aggregates if this is set to True +Returns: + rouge1: rouge_1 (f1), + rouge2: rouge_2 (f1), + rougeL: rouge_l (f1), + rougeLsum: rouge_lsum (f1) +Examples: + + >>> rouge = evaluate.load('rouge') + >>> predictions = ["hello there", "general kenobi"] + >>> references = ["hello there", "general kenobi"] + >>> results = rouge.compute(predictions=predictions, references=references) + >>> print(results) + {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0} +""" + + +class Tokenizer: + """Helper class to wrap a callable into a class with a `tokenize` method as used by rouge-score.""" + + def __init__(self, tokenizer_func): + self.tokenizer_func = tokenizer_func + + def tokenize(self, text): + return self.tokenizer_func(text) + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Rouge(evaluate.Metric): + def _info(self): + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=[ + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence(datasets.Value("string", id="sequence")), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), + ], + codebase_urls=["https://github.com/google-research/google-research/tree/master/rouge"], + reference_urls=[ + 
"https://en.wikipedia.org/wiki/ROUGE_(metric)", + "https://github.com/google-research/google-research/tree/master/rouge", + ], + ) + + def _compute( + self, predictions, references, rouge_types=None, use_aggregator=True, use_stemmer=False, tokenizer=None + ): + if rouge_types is None: + rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"] + + multi_ref = isinstance(references[0], list) + + if tokenizer is not None: + tokenizer = Tokenizer(tokenizer) + + scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=use_stemmer, tokenizer=tokenizer) + if use_aggregator: + aggregator = scoring.BootstrapAggregator() + else: + scores = [] + + for ref, pred in zip(references, predictions): + if multi_ref: + score = scorer.score_multi(ref, pred) + else: + score = scorer.score(ref, pred) + if use_aggregator: + aggregator.add_scores(score) + else: + scores.append(score) + + if use_aggregator: + result = aggregator.aggregate() + for key in result: + result[key] = result[key].mid.fmeasure + + else: + result = {} + for key in scores[0]: + result[key] = list(score[key].fmeasure for score in scores) + + return result diff --git a/evals/metrics/utils.py b/evals/metrics/utils.py index cab4bede..42711047 100644 --- a/evals/metrics/utils.py +++ b/evals/metrics/utils.py @@ -2,8 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import json +import os from typing import Any, List, Optional, Tuple, Union +import evaluate from pydantic import BaseModel @@ -69,3 +71,40 @@ def print_verbose_logs(metric: str, logs: str): print(logs) print("") print("=" * 70) + + +def catch_all_exceptions(func): + def wrapper(*args, **kwargs): + try: + result = func(*args, **kwargs) + return result + except Exception as e: + print(repr(e)) + + return wrapper + + +@catch_all_exceptions +def bleu_score(continuation: str, reference: str, with_penalty=False) -> float: + bleu = evaluate.load(os.path.join(os.path.dirname(__file__), "bleu")) + results = 
def catch_all_exceptions(func):
    """Decorator for best-effort metric evaluation.

    Any exception raised by ``func`` is printed (``repr``) and swallowed;
    the wrapper then returns None instead of propagating the error.
    """
    from functools import wraps

    @wraps(func)  # preserve the wrapped function's name/docstring
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print(repr(e))

    return wrapper


@catch_all_exceptions
def bleu_score(continuation: str, reference: str, with_penalty=False) -> tuple:
    """Compute BLEU of ``continuation`` against a single ``reference``.

    Args:
        continuation (str): The generated text to score.
        reference (str): The gold reference text.
        with_penalty (bool): If True, keep the brevity penalty in the
            averaged score; otherwise divide it out.

    Returns:
        tuple: (bleu_avg, bleu1, bleu2, bleu3, bleu4), where bleu1..bleu4
        are the 1- to 4-gram precisions. (Annotation fixed: five values
        are returned, not a single float.)
    """
    # Load the local evaluate-style metric script shipped next to this module.
    bleu = evaluate.load(os.path.join(os.path.dirname(__file__), "bleu"))
    results = bleu.compute(predictions=[continuation], references=[[reference]])

    bleu_avg = results["bleu"]
    bleu1, bleu2, bleu3, bleu4 = results["precisions"]  # n-gram precisions, n = 1..4
    brevity_penalty = results["brevity_penalty"]

    if with_penalty:
        return bleu_avg, bleu1, bleu2, bleu3, bleu4
    # Divide the brevity penalty back out; a zero penalty means an empty
    # hypothesis, for which the unpenalized score is defined as 0.
    return (0.0 if brevity_penalty == 0 else bleu_avg / brevity_penalty), bleu1, bleu2, bleu3, bleu4


@catch_all_exceptions
def rougeL_score(continuation: str, reference: str) -> float:
    """Compute the ROUGE-L F1 of ``continuation`` against ``reference``."""
    rouge = evaluate.load(os.path.join(os.path.dirname(__file__), "rouge"))
    results = rouge.compute(predictions=[continuation], references=[[reference]], rouge_types=["rougeL"])
    score = results["rougeL"]
    return score