From e7962d432b055431fc5f29510af8667fabee8070 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 22:16:21 -0400
Subject: [PATCH 01/10] fix GAD splits

---
 lm_eval/tasks/gad.py | 79 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 lm_eval/tasks/gad.py

diff --git a/lm_eval/tasks/gad.py b/lm_eval/tasks/gad.py
new file mode 100644
index 0000000000..c65c4c280a
--- /dev/null
+++ b/lm_eval/tasks/gad.py
@@ -0,0 +1,79 @@
+"""
+Gad: A corpus identifying associations between genes and diseases by a semi-automatic annotation procedure based on the Genetic Association Database
+Homepage: "https://github.com/dmis-lab/biobert"
+"""
+from lm_eval.base import BioTask
+
+_CITATION = """
+@article{Bravo2015,
+  doi = {10.1186/s12859-015-0472-9},
+  url = {https://doi.org/10.1186/s12859-015-0472-9},
+  year = {2015},
+  month = feb,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {16},
+  number = {1},
+  author = {{\`{A}}lex Bravo and Janet Pi{\~{n}}ero and N{\'{u}}ria Queralt-Rosinach and Michael Rautschka and Laura I Furlong},
+  title = {Extraction of relations between genes and diseases from text and large-scale data analysis: implications for translational research},
+  journal = {{BMC} Bioinformatics}
+}
+"""
+
+
+class GadBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/blurb_datasets/gad"
+    DATASET_NAME = None
+    SPLIT = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+
+class GadFold0Text(GadBase):
+    DATASET_NAME = "gad_fold0_bigbio_text"
+
+class GadFold1Text(GadBase):
+    DATASET_NAME = "gad_fold1_bigbio_text"
+
+class GadFold2Text(GadBase):
+    DATASET_NAME = "gad_fold2_bigbio_text"
+
+class GadFold3Text(GadBase):
+    DATASET_NAME = "gad_fold3_bigbio_text"
+
+class GadFold4Text(GadBase):
+    DATASET_NAME = "gad_fold4_bigbio_text"
+
+class GadFold5Text(GadBase):
+    DATASET_NAME = "gad_fold5_bigbio_text"
+
+class GadFold6Text(GadBase):
+    DATASET_NAME = "gad_fold6_bigbio_text"
+
+class GadFold7Text(GadBase):
+    DATASET_NAME = "gad_fold7_bigbio_text"
+
+class GadFold8Text(GadBase):
+    DATASET_NAME = "gad_fold8_bigbio_text"
+
+class GadFold9Text(GadBase):
+    DATASET_NAME = "gad_fold9_bigbio_text"
\ No newline at end of file

From f67980f3f01fa0ef292d44b69ed28f6b94c9ea04 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 22:21:42 -0400
Subject: [PATCH 02/10] add all gad splits

---
 lm_eval/tasks/__init__.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index a8fc9c31dc..46ead26713 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -27,7 +27,7 @@
 from . import wmt
 from . import cnn_dailymail
 from . import diabla
-
+from . import gad
 ########################################
 # All tasks
@@ -199,6 +199,20 @@
     # SciTail
     "scitail": scitail.SciTailTE,
+
+    # All GAD Datasets
+    "gad0": gad.GadFold0Text,
+    "gad1": gad.GadFold1Text,
+    "gad2": gad.GadFold2Text,
+    "gad3": gad.GadFold3Text,
+    "gad4": gad.GadFold4Text,
+    "gad5": gad.GadFold5Text,
+    "gad6": gad.GadFold6Text,
+    "gad7": gad.GadFold7Text,
+    "gad8": gad.GadFold8Text,
+    "gad9": gad.GadFold9Text,
+
+
 }

From 11d36c0a6bcc3e1c785483e4ee8a0d9cef81b49c Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 22:23:04 -0400
Subject: [PATCH 03/10] fix: 720 target should be a list of str not str only

---
 lm_eval/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lm_eval/base.py b/lm_eval/base.py
index 55db8af543..ba7ffbcdab 100644
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -717,7 +717,7 @@ def process_results(self, doc, results):
             The results of the requests created in construct_requests.
         """
         answer_choices_list = self.prompt.get_answer_choices_list(doc)
-        target = self.doc_to_target(doc)
+        target = [self.doc_to_target(doc)]  # N.SEELAM The target is a str, not a list of strs
         if answer_choices_list:
             # If answer_choices_list, then this is a ranked choice prompt.
             # NOTE: In the future, target could be a list of strings.
@@ -1052,7 +1052,7 @@ class BioTask(PromptSourceTask):
     *and* add additional custom processing, override `process_results`,
     `higher_is_better`, and `aggregation`.
     """
 
-    CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy"])
+    CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy", "Other"])
     CONFIGURED_GENERATION_PS_METRICS = set(["BLEU", "ROUGE", "SARI"])
     SPLIT = None

From 2f5c507cf8fcea5783d316fac38f8e84083c55ef Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:19:15 -0400
Subject: [PATCH 04/10] adds gadblurb task

---
 lm_eval/tasks/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 46ead26713..206e8270f9 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -211,6 +211,9 @@
     "gad7": gad.GadFold7Text,
     "gad8": gad.GadFold8Text,
     "gad9": gad.GadFold9Text,
+    "gadblurb": gad.GadBlurbText,
+
+    # Biosses
 
 
 }

From 30e5e0b265b349c5df8edfdc341cf528b4326095 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:19:26 -0400
Subject: [PATCH 05/10] adds gadblurb task

---
 lm_eval/tasks/gad.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lm_eval/tasks/gad.py b/lm_eval/tasks/gad.py
index c65c4c280a..f1f3ea5161 100644
--- a/lm_eval/tasks/gad.py
+++ b/lm_eval/tasks/gad.py
@@ -22,7 +22,7 @@
 class GadBase(BioTask):
     VERSION = 0
-    DATASET_PATH = "/home/natasha/Projects/hfbiomed/blurb_datasets/gad"
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/gad"
     DATASET_NAME = None
     SPLIT = None
 
@@ -47,6 +47,9 @@ def test_docs(self):
         if self.has_test_docs():
             return self.dataset["test"]
 
+class GadBlurbText(GadBase):
+    """BLURB split from GAD, based on fold1"""
+    DATASET_NAME = "gad_blurb_bigbio_text"
 
 class GadFold0Text(GadBase):
     DATASET_NAME = "gad_fold0_bigbio_text"

From 96c840e4423261eab6f4f337215022b48abb5289 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:20:25 -0400
Subject: [PATCH 06/10] change default name to gad

---
 lm_eval/tasks/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 206e8270f9..f2e74e4823 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -211,7 +211,7 @@
     "gad7": gad.GadFold7Text,
     "gad8": gad.GadFold8Text,
     "gad9": gad.GadFold9Text,
-    "gadblurb": gad.GadBlurbText,
+    "gad": gad.GadBlurbText,
 
     # Biosses
 

From 19d38363329355833487f14c982dbd49aeb8525b Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:22:56 -0400
Subject: [PATCH 07/10] adds biosses tasks

---
 lm_eval/tasks/biosses.py | 50 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 lm_eval/tasks/biosses.py

diff --git a/lm_eval/tasks/biosses.py b/lm_eval/tasks/biosses.py
new file mode 100644
index 0000000000..6e434dc743
--- /dev/null
+++ b/lm_eval/tasks/biosses.py
@@ -0,0 +1,50 @@
+"""
+BIOSSES:
+NOTE: A stupid hack I am doing is forcing a "validation" set as this only has training data implemented.
+"""
+from lm_eval.base import BioTask
+
+_CITATION = """
+@article{souganciouglu2017biosses,
+  title={BIOSSES: a semantic sentence similarity estimation system for the biomedical domain},
+  author={Soğancıoğlu, Gizem and Öztürk, Hakime and Özgür, Arzucan},
+  journal={Bioinformatics},
+  volume={33},
+  number={14},
+  pages={i49--i58},
+  year={2017},
+  publisher={Oxford University Press}
+}
+"""
+
+
+class BiossesBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/biosses"
+    DATASET_NAME = None
+    SPLIT = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+
+class BiossesPairs(BiossesBase):
+    DATASET_NAME = "biosses_bigbio_pairs"
\ No newline at end of file

From 06e8d4307db143a21d00b0f9b6f32cb10b856326 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:24:16 -0400
Subject: [PATCH 08/10] adds biosses task and init

---
 lm_eval/tasks/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index f2e74e4823..03ecbb0f6a 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -28,6 +28,7 @@
 from . import cnn_dailymail
 from . import diabla
 from . import gad
+from . import biosses
 ########################################
 # All tasks
@@ -214,7 +215,7 @@
     "gad": gad.GadBlurbText,
 
     # Biosses
-
+    "biosses": biosses.BiossesPairs,
 
 
 }

From 042ef9c42a63191bcd84306cc1605ab760e72b9d Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sun, 5 Jun 2022 00:00:05 -0400
Subject: [PATCH 09/10] adds hacky script for correlation

---
 get_pearson_corr_biosses.py | 62 +++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 get_pearson_corr_biosses.py

diff --git a/get_pearson_corr_biosses.py b/get_pearson_corr_biosses.py
new file mode 100644
index 0000000000..2483f82e04
--- /dev/null
+++ b/get_pearson_corr_biosses.py
@@ -0,0 +1,62 @@
+"""Hacky script to get pearson's correlation
+2022.06.04
+"""
+import os
+from pathlib import Path
+from scipy.stats import pearsonr
+import json
+
+if __name__ == "__main__":
+
+    # path to lm-eval and outputs
+    dpath = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/lm-evaluation-harness/outputs"
+    dataset = "biosses"
+
+    # Get all files with a dataset
+    fnames = Path(dpath).glob("*" + dataset + "*")
+    fnames = [i for i in fnames if "examples-" in i.__str__()]
+
+    # Get the latest result
+    latest_result = sorted(fnames, key=os.path.getmtime)[-1]
+
+    with open(latest_result, 'r') as json_file:
+        json_list = list(json_file)
+
+
+    # Update only the tasks with float-type predictions
+    tasks = {}
+    for json_str in json_list:
+        result = json.loads(json_str)
+
+        task = result.get("prompt_name", None)
+
+        if result.get("prompt_name", None) is not None:
+
+            if task not in tasks:
+                tasks.update(
+                    {task: {"pred": [], "target": []} }
+                )
+
+            pred = result.get("pred", None)
+            target = result.get("target", None)
+
+            if (pred is not None) and (target is not None):
+                try:
+                    tasks[task]["pred"].append(float(pred))
+                    tasks[task]["target"].append(float(target))
+                except ValueError:
+                    pass
+
+    # For each task with non-zero pred/targets, compute Pearson-R
+    lines = []
+    row = ["Task", "Correlation", "P-value"]
+    lines.append(",".join(row) + "\n")
+    for task in tasks:
+        if len(tasks[task]["pred"]):
+            corr, pval = pearsonr(tasks[task]["pred"], tasks[task]["target"])
+            row = [task, str(round(corr, 3)), str(pval)[:5]]
+            lines.append(" ".join(row).rjust(20, " ") + "\n")
+            lines.append(",".join(row) + "\n")
+
+    with open(dataset + "_results.txt", "w") as f:
+        f.writelines(lines)

From 8f0c6dda334f0d427d399e8a450943624be8d41d Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sun, 5 Jun 2022 00:00:33 -0400
Subject: [PATCH 10/10] documents instructions on how to get things working

---
 instructions.md | 162 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 instructions.md

diff --git a/instructions.md b/instructions.md
new file mode 100644
index 0000000000..72749df0bc
--- /dev/null
+++ b/instructions.md
@@ -0,0 +1,162 @@
+## Setting up BioPrompting
+**2022.06.03**
+
+In order to get a fresh install, I did the following:
+
+### Create a fresh conda environment
+
+I created a new conda environment with the following commands:
+
+```
+conda create -n bbprompt python=3.9
+conda activate bbprompt
+```
+
+
+### Install the bigbio fork of promptsource
+
+I installed promptsource as such:
+
+```
+git clone https://github.com/OpenBioLink/promptsource
+cd promptsource
+pip install -e .
+cd ../
+```
+
+You may want to fork your own version and install the remote fork.
+
+
+### Install LM-Eval
+
+Install specifically the bigbio version of LM-Eval. You can do so as follows:
+
+```
+git clone git@github.com:bigscience-workshop/lm-evaluation-harness.git
+cd lm-evaluation-harness
+git checkout bigbio
+git pull origin bigbio
+pip install -e .
+cd ..
+```
+
+### Install the most recent BigBio dataloaders
+
+Install the main branch of bigbio:
+
+```
+git clone git@github.com:bigscience-workshop/biomedical.git
+cd biomedical
+pip install -e .
+cd ..
+```
+
+### Creating a custom prompt
+
+**Make sure that in your promptsource installation, a corresponding template exists!**
+
+Templates live in `promptsource/promptsource/templates`. The file itself should be called `templates.yaml` and should be generated via the streamlit app (make sure protobuf <= 3.20.X). The template folder should follow this structure:
+
+`promptsource/templates/your_dataset_name/your_dataset_name_bigbio_schema`
+
+where `your_dataset_name` and `schema` are replaced with the name of the dataset and the specific config you wish to use.
+
+##### Create a new task
+
+Create a new task with the following format filled in.
+
+Note: you will not get results if your data does not have validation and test sets. A crappy hack is to return the train set from `validation_docs` and/or `test_docs`.
+
+Place a file `yourdataset.py` in `lm-evaluation-harness/lm_eval/tasks` that fills out the criteria below:
+
+```python
+from lm_eval.base import BioTask
+
+_CITATION = """
+PLACE_YOUR_CITATION_HERE
+"""
+
+
+class YourDatasetBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "path/to/dataloader/script/from/bigbio"
+    DATASET_NAME = None
+    SPLIT = None
+
+    # Fill these out as T/F depending on your dataset
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]  # you can replace with `train` to hack around
+
+
+class YourDatasetSplit(YourDatasetBase):
+    DATASET_NAME = "yourdataset_bigbio_<schema>"  # e.g. "gad_blurb_bigbio_text"
+```
+
+Add this dataset task to `lm-evaluation-harness/lm_eval/tasks/__init__.py` by adding the following lines:
+
+```python
+from . import yourdataset  # Place this with the imports at the top
+
+# Within TASK_REGISTRY, add the following entry
+TASK_REGISTRY = {
+    ...
+    "your_dataset_name": yourdataset.Class_Corresponding_To_Schema,
+}
+```
+
+(For example, BIOSSES would look as such:)
+```python
+    "biosses": biosses.BiossesPairs,
+```
+
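+Before moving on, it can help to sanity-check that promptsource can actually see the template you created in the "Creating a custom prompt" step. The snippet below is an untested sketch: it assumes the bigbio promptsource fork keeps the standard `DatasetTemplates` API, and the dataset/config names are placeholders you should replace with your own.
+
+```python
+# Untested sketch: list the prompts promptsource can find for your config.
+# The two arguments mirror the folder structure described above
+# (dataset name, then the bigbio config name) -- adjust both to your dataset.
+from promptsource.templates import DatasetTemplates
+
+templates = DatasetTemplates("your_dataset_name", "your_dataset_name_bigbio_schema")
+print(templates.all_template_names)
+```
+
+If nothing is listed, lm-eval will not find any prompts for your task either.
+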
+2) [L1055, L1056](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L1055): Edit `CONFIGURED_RANKED_CHOICE_PS_METRICS` or `CONFIGURED_GENERATION_PS_METRICS` to include your custom task metric (found in your `template.yaml` file prompts under `metrics`.) + +### Implementing a Custom Metric + +In cases where you may need to implement a custom metric, you will need to write a custom function in `lm-evaluation-harness/lm_eval/metrics.py`. More advanced implementations can exist in `lm-evaluation-harness/lm_eval/metrics_impls`. + +**NOTE** If you are working with numerical information (I.e. correlation etc) make sure you have answer choices. If your answer choices are `null` in your prompt, you will go into the "generation" part of the code which may not be useful. + +Next, ensure your task has an output in `aggregation` from in `lm-evaluation-harness/lm_eval/base.py`. +In `lm-evaluation-harness/lm_eval/evaluator.py`, the actual eval code is executed. Changes can be made around 265 to change your logic. + +# Running your Task + +If you implemented the above successfully, your command should run as follows: +``` +python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks yourdataset --device cpu +``` + +**If you want to run GAD's BLURB set, try:**
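+As a concrete illustration, a correlation-style metric could look roughly like the sketch below. This is an untested sketch and not part of this PR's changes: it assumes the usual harness convention that an aggregation function receives a list of per-example items (here, `(prediction, target)` pairs of floats), and the name `pearson_corr` is just an example.
+
+```python
+# Untested sketch for lm_eval/metrics.py: aggregate the (prediction, target)
+# pairs collected by process_results into a single Pearson correlation.
+from scipy.stats import pearsonr
+
+
+def pearson_corr(items):
+    preds, targets = zip(*items)
+    corr, _pval = pearsonr(preds, targets)
+    return corr
+```
+
+Your task would then return this function from `aggregation()` (e.g. `{"pearson_corr": pearson_corr}`) and flag it in `higher_is_better()`, alongside whatever per-example values you emit from `process_results`.
+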
+# Running your Task
+
+If you implemented the above successfully, your command should run as follows:
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks yourdataset --device cpu
+```
+
+**If you want to run GAD's BLURB set, try:**
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks gad --device cpu
+```
+
+**If you want to run BIOSSES's BLURB set, try:**
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks biosses --device cpu
+```
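+
+Since BIOSSES is a sentence-similarity task, you probably want a Pearson correlation over the predictions rather than accuracy. The hacky `get_pearson_corr_biosses.py` script added earlier in this PR post-processes the latest `examples-*biosses*` file in the outputs folder; it hard-codes the outputs path and dataset name at the top, so adjust those before running:
+
+```
+python get_pearson_corr_biosses.py
+```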