diff --git a/get_pearson_corr_biosses.py b/get_pearson_corr_biosses.py
new file mode 100644
index 0000000000..2483f82e04
--- /dev/null
+++ b/get_pearson_corr_biosses.py
@@ -0,0 +1,62 @@
+"""Hacky script to get pearson's correlation
+2022.06.04
+"""
+import os
+from pathlib import Path
+from scipy.stats import pearsonr
+import json
+
+if __name__ == "__main__":
+
+    # path to lm-eval and outputs
+    dpath = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/lm-evaluation-harness/outputs"
+    dataset = "biosses"
+
+    # Get all files with a dataset
+    fnames = Path(dpath).glob("*" + dataset + "*")
+    fnames = [i for i in fnames if "examples-" in i.__str__()]
+
+    # Get the latest result
+    latest_result = sorted(fnames, key=os.path.getmtime)[-1]
+
+    with open(latest_result, 'r') as json_file:
+        json_list = list(json_file)
+
+
+    # Update only the tasks with float-type predictions
+    tasks = {}
+    for json_str in json_list:
+        result = json.loads(json_str)
+
+        task = result.get("prompt_name", None)
+
+        if result.get("prompt_name", None) is not None:
+
+            if task not in tasks:
+                tasks.update(
+                    {task: {"pred": [], "target": []} }
+                )
+
+            pred = result.get("pred", None)
+            target = result.get("target", None)
+
+            if (pred is not None) and (target is not None):
+                try:
+                    tasks[task]["pred"].append(float(pred))
+                    tasks[task]["target"].append(float(target))
+                except ValueError:
+                    pass
+
+    # For each task with non-zero pred/targets, compute Pearson-R
+    lines = []
+    row = ["Task", "Correlation", "P-value"]
+    lines.append(",".join(row) + "\n")
+    for task in tasks:
+        if len(tasks[task]["pred"]):
+            corr, pval = pearsonr(tasks[task]["pred"], tasks[task]["target"])
+            row = [task, str(round(corr, 3)), str(pval)[:5]]
+            lines.append(" ".join(row).rjust(20, " ") + "\n")
+            lines.append(",".join(row) + "\n")
+
+    with open(dataset + "_results.txt", "w") as f:
+        f.writelines(lines)
diff --git a/instructions.md
new file mode 100644
index 0000000000..72749df0bc
--- /dev/null
+++ 
b/instructions.md
@@ -0,0 +1,162 @@
+## Setting up BioPrompting
+**2022.06.03**
+
+In order to get a fresh install, I did the following:
+
+### Create a fresh conda environment
+
+I created a new conda environment with the following commands:
+
+```
+conda create -n bbprompt python=3.9
+conda activate bbprompt
+```
+
+
+### Install the bigbio fork of promptsource
+
+I installed promptsource as such:
+
+```
+git clone https://github.com/OpenBioLink/promptsource
+cd promptsource
+pip install -e .
+cd ../
+```
+
+You may want to fork your own version and install the remote fork.
+
+
+### Install LM-Eval
+
+Install specifically the bigbio version of LM-Eval. You can do so as follows:
+
+```
+git clone git@github.com:bigscience-workshop/lm-evaluation-harness.git
+cd lm-evaluation-harness
+git checkout bigbio
+git pull origin bigbio
+pip install -e .
+cd ..
+```
+
+### Install the most recent BigBio dataloaders dataset
+
+Install the main branch of bigbio:
+
+```
+git clone git@github.com:bigscience-workshop/biomedical.git
+cd biomedical
+pip install -e .
+cd ..
+```
+
+### Creating a custom prompt
+
+**Make sure that in your promptsource installation, a corresponding template exists!**
+
+Existing templates can be found here: `promptsource/promptsource/templates`. The file itself should be called `templates.yaml` and should be generated via the streamlit app (make sure protobuf <= 3.20.X). The template should live in a folder with the following structure:
+
+`promptsource/templates/your_dataset_name/your_dataset_name_bigbio_schema`
+
+where `your_dataset_name` and `schema` are replaced with the name of the dataset and the specific config you wish to use.
+
+##### Create a new task
+
+Create a new task with the following format filled in:
+
+Note, you will not get results if your data does not have a validation + test set. A crappy hack is to return validation_docs and/or test_docs as the train set.
+
+Place a file `yourdataset.py` in `lm-evaluation-harness/lm_eval/tasks` that fills out the criteria below:
+
+```python
+from lm_eval.base import BioTask
+
+_CITATION = """
+PLACE_YOUR_CITATION_HERE
+"""
+
+
+class YourDatasetBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "path/to/dataloader/script/from/bigbio"
+    DATASET_NAME = None
+    SPLIT = None
+
+    # Fill these out as T/F depending on your dataset
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]  # you can replace with `train` to hack around
+
+
+class YourDatasetSplit(YourDatasetBase):
+    DATASET_NAME = "yourdataset_bigbio_"
+```
+
+Add this dataset task to `lm-evaluation-harness/lm_eval/tasks/__init__.py` by adding the following lines:
+
+```python
+from . import yourdataset  # Place this in the beginning import
+
+# Within TASK_REGISTRY, add the following command
+TASK_REGISTRY = {
+    ...
+    "your_dataset_name": yourdataset.Class_Corresponding_To_Schema
+}
+```
+
+(For example, BIOSSES would look as such:)
+```python
+    "biosses": biosses.BiossesPairs
+```
+
+### Getting your task to run
+
+In order to get lm-eval to run, I made the following changes to `lm-evaluation-harness/lm_eval/base.py`.
+
+1) [L720](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L720): change `target = self.doc_to_target(doc)` to `target = [self.doc_to_target(doc)]` This seemed to give me an issue that returned a str as opposed to a List of str.
+
+2) [L1055, L1056](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L1055): Edit `CONFIGURED_RANKED_CHOICE_PS_METRICS` or `CONFIGURED_GENERATION_PS_METRICS` to include your custom task metric (found in your `template.yaml` file prompts under `metrics`.)
+
+### Implementing a Custom Metric
+
+In cases where you may need to implement a custom metric, you will need to write a custom function in `lm-evaluation-harness/lm_eval/metrics.py`. More advanced implementations can exist in `lm-evaluation-harness/lm_eval/metrics_impls`.
+
+**NOTE** If you are working with numerical information (i.e. correlation, etc.) make sure you have answer choices. If your answer choices are `null` in your prompt, you will go into the "generation" part of the code which may not be useful.
+
+Next, ensure your task has an output in `aggregation` in `lm-evaluation-harness/lm_eval/base.py`.
+In `lm-evaluation-harness/lm_eval/evaluator.py`, the actual eval code is executed. Changes can be made around line 265 to change your logic.
+
+# Running your Task
+
+If you implemented the above successfully, your command should run as follows:
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks yourdataset --device cpu
+```
+
+**If you want to run GAD's BLURB set, try:**
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks gad --device cpu
+```
+
+**If you want to run BIOSSES's BLURB set, try:**
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks biosses --device cpu
+```
diff --git a/lm_eval/base.py b/lm_eval/base.py
index 55db8af543..ba7ffbcdab 100644
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -717,7 +717,7 @@
             The results of the requests created in construct_requests.
         """
         answer_choices_list = self.prompt.get_answer_choices_list(doc)
-        target = self.doc_to_target(doc)
+        target = [self.doc_to_target(doc)]  # N.SEELAM The target is a str, not a list of strs
         if answer_choices_list:
             # If answer_choices_list, then this is a ranked choice prompt.
             # NOTE: In the future, target could be a list of strings.
@@ -1052,7 +1052,7 @@ class BioTask(PromptSourceTask):
     *and* add additional custom processing, override
     `process_results`, `higher_is_better`, and `aggregation`.
     """
-    CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy"])
+    CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy", "Other"])
     CONFIGURED_GENERATION_PS_METRICS = set(["BLEU", "ROUGE", "SARI"])
     SPLIT = None
 
diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index a8fc9c31dc..03ecbb0f6a 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -27,7 +27,8 @@
 from . import wmt
 from . import cnn_dailymail
 from . import diabla
-
+from . import gad
+from . import biosses
 
 ########################################
 # All tasks
@@ -199,6 +200,23 @@
     # SciTail
     "scitail": scitail.SciTailTE,
+
+    # All GAD Datasets
+    "gad0": gad.GadFold0Text,
+    "gad1": gad.GadFold1Text,
+    "gad2": gad.GadFold2Text,
+    "gad3": gad.GadFold3Text,
+    "gad4": gad.GadFold4Text,
+    "gad5": gad.GadFold5Text,
+    "gad6": gad.GadFold6Text,
+    "gad7": gad.GadFold7Text,
+    "gad8": gad.GadFold8Text,
+    "gad9": gad.GadFold9Text,
+    "gad": gad.GadBlurbText,
+
+    # Biosses
+    "biosses": biosses.BiossesPairs,
+
 }
diff --git a/lm_eval/tasks/biosses.py b/lm_eval/tasks/biosses.py
new file mode 100644
index 0000000000..6e434dc743
--- /dev/null
+++ b/lm_eval/tasks/biosses.py
@@ -0,0 +1,50 @@
+"""
+BIOSSES:
+NOTE: A stupid hack I am doing is forcing a "validation" set as this only has training data implemented.
+"""
+from lm_eval.base import BioTask
+
+_CITATION = """
+@article{souganciouglu2017biosses,
+  title={BIOSSES: a semantic sentence similarity estimation system for the biomedical domain},
+  author={Soğancıoğlu, Gizem, Hakime Öztürk, and Arzucan Özgür},
+  journal={Bioinformatics},
+  volume={33},
+  number={14},
+  pages={i49--i58},
+  year={2017},
+  publisher={Oxford University Press}
+}
+"""
+
+
+class BiossesBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/biosses"
+    DATASET_NAME = None
+    SPLIT = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+
+class BiossesPairs(BiossesBase):
+    DATASET_NAME = "biosses_bigbio_pairs"
\ No newline at end of file
diff --git a/lm_eval/tasks/gad.py b/lm_eval/tasks/gad.py
new file mode 100644 
index 0000000000..f1f3ea5161
--- /dev/null
+++ b/lm_eval/tasks/gad.py
@@ -0,0 +1,82 @@
+"""
+Gad: A corpus identifying associations between genes and diseases by a semi-automatic annotation procedure based on the Genetic Association Database
+Homepage: "https://github.com/dmis-lab/biobert"
+"""
+from lm_eval.base import BioTask
+
+_CITATION = """
+@article{Bravo2015,
+  doi = {10.1186/s12859-015-0472-9},
+  url = {https://doi.org/10.1186/s12859-015-0472-9},
+  year = {2015},
+  month = feb,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {16},
+  number = {1},
+  author = {{\`{A}}lex Bravo and Janet Pi{\~{n}}ero and N{\'{u}}ria Queralt-Rosinach and Michael Rautschka and Laura I Furlong},
+  title = {Extraction of relations between genes and diseases from text and large-scale data analysis: implications for translational research},
+  journal = {{BMC} Bioinformatics}
+}
+"""
+
+
+class GadBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/gad"
+    DATASET_NAME = None
+    SPLIT = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+class GadBlurbText(GadBase):
+    """BLURB split from GAD, based on fold1"""
+    DATASET_NAME = "gad_blurb_bigbio_text"
+
+class GadFold0Text(GadBase):
+    DATASET_NAME = "gad_fold0_bigbio_text"
+
+class GadFold1Text(GadBase):
+    DATASET_NAME = "gad_fold1_bigbio_text"
+
+class GadFold2Text(GadBase):
+    DATASET_NAME = "gad_fold2_bigbio_text"
+
+class GadFold3Text(GadBase):
+    DATASET_NAME = "gad_fold3_bigbio_text"
+
+class GadFold4Text(GadBase):
+    DATASET_NAME = "gad_fold4_bigbio_text"
+
+class GadFold5Text(GadBase):
+    DATASET_NAME = "gad_fold5_bigbio_text"
+
+class GadFold6Text(GadBase):
+    DATASET_NAME = "gad_fold6_bigbio_text"
+
+class GadFold7Text(GadBase):
+    DATASET_NAME = "gad_fold7_bigbio_text"
+
+class GadFold8Text(GadBase):
+    DATASET_NAME = "gad_fold8_bigbio_text"
+
+class GadFold9Text(GadBase):
+    DATASET_NAME = "gad_fold9_bigbio_text"
\ No newline at end of file