From e7962d432b055431fc5f29510af8667fabee8070 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 22:16:21 -0400
Subject: [PATCH 01/10] fix GAD splits

---
 lm_eval/tasks/gad.py | 79 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 lm_eval/tasks/gad.py

diff --git a/lm_eval/tasks/gad.py b/lm_eval/tasks/gad.py
new file mode 100644
index 0000000000..c65c4c280a
--- /dev/null
+++ b/lm_eval/tasks/gad.py
@@ -0,0 +1,79 @@
+"""
+Gad: A corpus identifying associations between genes and diseases by a semi-automatic annotation procedure based on the Genetic Association Database
+Homepage: "https://github.com/dmis-lab/biobert"
+"""
+from lm_eval.base import BioTask
+
+_CITATION = """
+@article{Bravo2015,
+  doi = {10.1186/s12859-015-0472-9},
+  url = {https://doi.org/10.1186/s12859-015-0472-9},
+  year = {2015},
+  month = feb,
+  publisher = {Springer Science and Business Media {LLC}},
+  volume = {16},
+  number = {1},
+  author = {{\`{A}}lex Bravo and Janet Pi{\~{n}}ero and N{\'{u}}ria Queralt-Rosinach and Michael Rautschka and Laura I Furlong},
+  title = {Extraction of relations between genes and diseases from text and large-scale data analysis: implications for translational research},
+  journal = {{BMC} Bioinformatics}
+}
+"""
+
+
+class GadBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/blurb_datasets/gad"
+    DATASET_NAME = None
+    SPLIT = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+
+class GadFold0Text(GadBase):
+    DATASET_NAME = "gad_fold0_bigbio_text"
+
+class GadFold1Text(GadBase):
+    DATASET_NAME = "gad_fold1_bigbio_text"
+
+class GadFold2Text(GadBase):
+    DATASET_NAME = "gad_fold2_bigbio_text"
+
+class GadFold3Text(GadBase):
+    DATASET_NAME = "gad_fold3_bigbio_text"
+
+class GadFold4Text(GadBase):
+    DATASET_NAME = "gad_fold4_bigbio_text"
+
+class GadFold5Text(GadBase):
+    DATASET_NAME = "gad_fold5_bigbio_text"
+
+class GadFold6Text(GadBase):
+    DATASET_NAME = "gad_fold6_bigbio_text"
+
+class GadFold7Text(GadBase):
+    DATASET_NAME = "gad_fold7_bigbio_text"
+
+class GadFold8Text(GadBase):
+    DATASET_NAME = "gad_fold8_bigbio_text"
+
+class GadFold9Text(GadBase):
+    DATASET_NAME = "gad_fold9_bigbio_text"
\ No newline at end of file

From f67980f3f01fa0ef292d44b69ed28f6b94c9ea04 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 22:21:42 -0400
Subject: [PATCH 02/10] add all gad splits

---
 lm_eval/tasks/__init__.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index a8fc9c31dc..46ead26713 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -27,7 +27,7 @@
 from . import wmt
 from . import cnn_dailymail
 from . import diabla
-
+from . import gad
 ########################################
 # All tasks
@@ -199,6 +199,20 @@
     # SciTail
     "scitail": scitail.SciTailTE,
+
+    # All GAD Datasets
+    "gad0": gad.GadFold0Text,
+    "gad1": gad.GadFold1Text,
+    "gad2": gad.GadFold2Text,
+    "gad3": gad.GadFold3Text,
+    "gad4": gad.GadFold4Text,
+    "gad5": gad.GadFold5Text,
+    "gad6": gad.GadFold6Text,
+    "gad7": gad.GadFold7Text,
+    "gad8": gad.GadFold8Text,
+    "gad9": gad.GadFold9Text,
+
+
 }

From 11d36c0a6bcc3e1c785483e4ee8a0d9cef81b49c Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 22:23:04 -0400
Subject: [PATCH 03/10] fix: 720 target should be a list of str not str only

---
 lm_eval/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lm_eval/base.py b/lm_eval/base.py
index 55db8af543..ba7ffbcdab 100644
--- a/lm_eval/base.py
+++ b/lm_eval/base.py
@@ -717,7 +717,7 @@ def process_results(self, doc, results):
             The results of the requests created in construct_requests.
         """
         answer_choices_list = self.prompt.get_answer_choices_list(doc)
-        target = self.doc_to_target(doc)
+        target = [self.doc_to_target(doc)]  # N.SEELAM The target is a str, not a list of strs
         if answer_choices_list:
             # If answer_choices_list, then this is a ranked choice prompt.
             # NOTE: In the future, target could be a list of strings.
@@ -1052,7 +1052,7 @@ class BioTask(PromptSourceTask):
     *and* add additional custom processing, override `process_results`,
     `higher_is_better`, and `aggregation`.
     """
 
-    CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy"])
+    CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy", "Other"])
     CONFIGURED_GENERATION_PS_METRICS = set(["BLEU", "ROUGE", "SARI"])
     SPLIT = None

From 2f5c507cf8fcea5783d316fac38f8e84083c55ef Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:19:15 -0400
Subject: [PATCH 04/10] adds gadblurb task

---
 lm_eval/tasks/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 46ead26713..206e8270f9 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -211,6 +211,9 @@
     "gad7": gad.GadFold7Text,
     "gad8": gad.GadFold8Text,
     "gad9": gad.GadFold9Text,
+    "gadblurb": gad.GadBlurbText,
+
+    # Biosses
 
 
 }

From 30e5e0b265b349c5df8edfdc341cf528b4326095 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:19:26 -0400
Subject: [PATCH 05/10] adds gadblurb task

---
 lm_eval/tasks/gad.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lm_eval/tasks/gad.py b/lm_eval/tasks/gad.py
index c65c4c280a..f1f3ea5161 100644
--- a/lm_eval/tasks/gad.py
+++ b/lm_eval/tasks/gad.py
@@ -22,7 +22,7 @@
 class GadBase(BioTask):
     VERSION = 0
-    DATASET_PATH = "/home/natasha/Projects/hfbiomed/blurb_datasets/gad"
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/gad"
     DATASET_NAME = None
     SPLIT = None
 
@@ -47,6 +47,9 @@ def test_docs(self):
         if self.has_test_docs():
             return self.dataset["test"]
 
+class GadBlurbText(GadBase):
+    """BLURB split from GAD, based on fold1"""
+    DATASET_NAME = "gad_blurb_bigbio_text"
 
 class GadFold0Text(GadBase):
     DATASET_NAME = "gad_fold0_bigbio_text"

From 96c840e4423261eab6f4f337215022b48abb5289 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:20:25 -0400
Subject: [PATCH 06/10] change default name to gad

---
 lm_eval/tasks/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index 206e8270f9..f2e74e4823 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -211,7 +211,7 @@
     "gad7": gad.GadFold7Text,
     "gad8": gad.GadFold8Text,
     "gad9": gad.GadFold9Text,
-    "gadblurb": gad.GadBlurbText,
+    "gad": gad.GadBlurbText,
 
     # Biosses
 

From 19d38363329355833487f14c982dbd49aeb8525b Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:22:56 -0400
Subject: [PATCH 07/10] adds biosses tasks

---
 lm_eval/tasks/biosses.py | 50 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 lm_eval/tasks/biosses.py

diff --git a/lm_eval/tasks/biosses.py b/lm_eval/tasks/biosses.py
new file mode 100644
index 0000000000..6e434dc743
--- /dev/null
+++ b/lm_eval/tasks/biosses.py
@@ -0,0 +1,50 @@
+"""
+BIOSSES:
+NOTE: A stupid hack I am doing is forcing a "validation" set as this only has training data implemented.
+"""
+from lm_eval.base import BioTask
+
+_CITATION = """
+@article{souganciouglu2017biosses,
+  title={BIOSSES: a semantic sentence similarity estimation system for the biomedical domain},
+  author={Soğancıoğlu, Gizem and Öztürk, Hakime and Özgür, Arzucan},
+  journal={Bioinformatics},
+  volume={33},
+  number={14},
+  pages={i49--i58},
+  year={2017},
+  publisher={Oxford University Press}
+}
+"""
+
+
+class BiossesBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/biosses"
+    DATASET_NAME = None
+    SPLIT = None
+
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]
+
+
+class BiossesPairs(BiossesBase):
+    DATASET_NAME = "biosses_bigbio_pairs"
\ No newline at end of file

From 06e8d4307db143a21d00b0f9b6f32cb10b856326 Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sat, 4 Jun 2022 23:24:16 -0400
Subject: [PATCH 08/10] adds biosses task and init

---
 lm_eval/tasks/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py
index f2e74e4823..03ecbb0f6a 100644
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -28,6 +28,7 @@
 from . import cnn_dailymail
 from . import diabla
 from . import gad
+from . import biosses
 ########################################
 # All tasks
@@ -214,7 +215,7 @@
     "gad": gad.GadBlurbText,
 
     # Biosses
-
+    "biosses": biosses.BiossesPairs,
 
 
 }

From 042ef9c42a63191bcd84306cc1605ab760e72b9d Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sun, 5 Jun 2022 00:00:05 -0400
Subject: [PATCH 09/10] adds hacky script for correlation

---
 get_pearson_corr_biosses.py | 62 +++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 get_pearson_corr_biosses.py

diff --git a/get_pearson_corr_biosses.py b/get_pearson_corr_biosses.py
new file mode 100644
index 0000000000..2483f82e04
--- /dev/null
+++ b/get_pearson_corr_biosses.py
@@ -0,0 +1,62 @@
+"""Hacky script to get pearson's correlation
+2022.06.04
+"""
+import os
+from pathlib import Path
+from scipy.stats import pearsonr
+import json
+
+if __name__ == "__main__":
+
+    # path to lm-eval and outputs
+    dpath = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/lm-evaluation-harness/outputs"
+    dataset = "biosses"
+
+    # Get all files with a dataset
+    fnames = Path(dpath).glob("*" + dataset + "*")
+    fnames = [i for i in fnames if "examples-" in i.__str__()]
+
+    # Get the latest result
+    latest_result = sorted(fnames, key=os.path.getmtime)[-1]
+
+    with open(latest_result, 'r') as json_file:
+        json_list = list(json_file)
+
+
+    # Update only the tasks with float-type predictions
+    tasks = {}
+    for json_str in json_list:
+        result = json.loads(json_str)
+
+        task = result.get("prompt_name", None)
+
+        if result.get("prompt_name", None) is not None:
+
+            if task not in tasks:
+                tasks.update(
+                    {task: {"pred": [], "target": []} }
+                )
+
+            pred = result.get("pred", None)
+            target = result.get("target", None)
+
+            if (pred is not None) and (target is not None):
+                try:
+                    tasks[task]["pred"].append(float(pred))
+                    tasks[task]["target"].append(float(target))
+                except ValueError:
+                    pass
+
+    # For each task with non-zero pred/targets, compute Pearson-R
+    lines = []
+    row = ["Task", "Correlation", "P-value"]
+    lines.append(",".join(row) + "\n")
+    for task in tasks:
+        if len(tasks[task]["pred"]):
+            corr, pval = pearsonr(tasks[task]["pred"], tasks[task]["target"])
+            row = [task, str(round(corr, 3)), str(pval)[:5]]
+            lines.append(" ".join(row).rjust(20, " ") + "\n")
+            lines.append(",".join(row) + "\n")
+
+    with open(dataset + "_results.txt", "w") as f:
+        f.writelines(lines)

From 8f0c6dda334f0d427d399e8a450943624be8d41d Mon Sep 17 00:00:00 2001
From: Natasha Seelam
Date: Sun, 5 Jun 2022 00:00:33 -0400
Subject: [PATCH 10/10] documents instructions on how to get things working

---
 instructions.md | 162 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100644 instructions.md

diff --git a/instructions.md b/instructions.md
new file mode 100644
index 0000000000..72749df0bc
--- /dev/null
+++ b/instructions.md
@@ -0,0 +1,162 @@
+## Setting up BioPrompting
+**2022.06.03**
+
+In order to get a fresh install, I did the following:
+
+### Create a fresh conda environment
+
+I created a new conda environment with the following commands:
+
+```
+conda create -n bbprompt python=3.9
+conda activate bbprompt
+```
+
+
+### Install the bigbio fork of promptsource
+
+I installed promptsource as such:
+
+```
+git clone https://github.com/OpenBioLink/promptsource
+cd promptsource
+pip install -e .
+cd ../
+```
+
+You may want to fork your own version and install the remote fork.
+
+
+### Install LM-Eval
+
+Install specifically the bigbio version of LM-Eval. You can do so as follows:
+
+```
+git clone git@github.com:bigscience-workshop/lm-evaluation-harness.git
+cd lm-evaluation-harness
+git checkout bigbio
+git pull origin bigbio
+pip install -e .
+cd ..
+```
+
+### Install the most recent BigBio dataloaders
+
+Install the main branch of bigbio:
+
+```
+git clone git@github.com:bigscience-workshop/biomedical.git
+cd biomedical
+pip install -e .
+cd ..
+```
+
+### Creating a custom prompt
+
+**Make sure that in your promptsource installation, a corresponding template exists!**
+
+Templates live in `promptsource/promptsource/templates`. The file itself should be called `templates.yaml` and should be generated via the streamlit app (make sure protobuf <= 3.20.X). The template folder should follow this structure:
+
+`promptsource/templates/your_dataset_name/your_dataset_name_bigbio_schema`
+
+where `your_dataset_name` and `schema` are replaced with the name of the dataset and the specific config you wish to use.
+
+##### Create a new task
+
+Create a new task with the following format filled in.
+
+Note: you will not get results if your data does not have validation and test sets. A crappy hack is to return the train set from `validation_docs` and/or `test_docs`.
+
+Place a file `yourdataset.py` in `lm-evaluation-harness/lm_eval/tasks` that fills out the criteria below:
+
+```python
+from lm_eval.base import BioTask
+
+_CITATION = """
+PLACE_YOUR_CITATION_HERE
+"""
+
+
+class YourDatasetBase(BioTask):
+    VERSION = 0
+    DATASET_PATH = "path/to/dataloader/script/from/bigbio"
+    DATASET_NAME = None
+    SPLIT = None
+
+    # Fill these out as T/F depending on your dataset
+    def has_training_docs(self):
+        return True
+
+    def has_validation_docs(self):
+        return True
+
+    def has_test_docs(self):
+        return True
+
+    def training_docs(self):
+        if self.has_training_docs():
+            return self.dataset["train"]
+
+    def validation_docs(self):
+        if self.has_validation_docs():
+            return self.dataset["validation"]
+
+    def test_docs(self):
+        if self.has_test_docs():
+            return self.dataset["test"]  # you can replace with `train` to hack around
+
+
+class YourDatasetSplit(YourDatasetBase):
+    DATASET_NAME = "yourdataset_bigbio_<schema>"  # e.g. "gad_blurb_bigbio_text"
+```
+
+Add this dataset task to `lm-evaluation-harness/lm_eval/tasks/__init__.py` by adding the following lines:
+
+```python
+from . import yourdataset  # Place this with the imports at the top
+
+# Within TASK_REGISTRY, add the following entry
+TASK_REGISTRY = {
+    ...
+    "your_dataset_name": yourdataset.Class_Corresponding_To_Schema,
+}
+```
+
+(For example, BIOSSES would look as such:)
+```python
+    "biosses": biosses.BiossesPairs,
+```
+
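+Before moving on, it can help to sanity-check that promptsource can actually see the template you created in the "Creating a custom prompt" step. The snippet below is an untested sketch: it assumes the bigbio promptsource fork keeps the standard `DatasetTemplates` API, and the dataset/config names are placeholders you should replace with your own.
+
+```python
+# Untested sketch: list the prompts promptsource can find for your config.
+# The two arguments mirror the folder structure described above
+# (dataset name, then the bigbio config name) -- adjust both to your dataset.
+from promptsource.templates import DatasetTemplates
+
+templates = DatasetTemplates("your_dataset_name", "your_dataset_name_bigbio_schema")
+print(templates.all_template_names)
+```
+
+If nothing is listed, lm-eval will not find any prompts for your task either.
+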
+2) [L1055, L1056](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L1055): Edit `CONFIGURED_RANKED_CHOICE_PS_METRICS` or `CONFIGURED_GENERATION_PS_METRICS` to include your custom task metric (found in your `template.yaml` file prompts under `metrics`.) + +### Implementing a Custom Metric + +In cases where you may need to implement a custom metric, you will need to write a custom function in `lm-evaluation-harness/lm_eval/metrics.py`. More advanced implementations can exist in `lm-evaluation-harness/lm_eval/metrics_impls`. + +**NOTE** If you are working with numerical information (I.e. correlation etc) make sure you have answer choices. If your answer choices are `null` in your prompt, you will go into the "generation" part of the code which may not be useful. + +Next, ensure your task has an output in `aggregation` from in `lm-evaluation-harness/lm_eval/base.py`. +In `lm-evaluation-harness/lm_eval/evaluator.py`, the actual eval code is executed. Changes can be made around 265 to change your logic. + +# Running your Task + +If you implemented the above successfully, your command should run as follows: +``` +python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks yourdataset --device cpu +``` + +**If you want to run GAD's BLURB set, try:**
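+As a concrete illustration, a correlation-style metric could look roughly like the sketch below. This is an untested sketch and not part of this PR's changes: it assumes the usual harness convention that an aggregation function receives a list of per-example items (here, `(prediction, target)` pairs of floats), and the name `pearson_corr` is just an example.
+
+```python
+# Untested sketch for lm_eval/metrics.py: aggregate the (prediction, target)
+# pairs collected by process_results into a single Pearson correlation.
+from scipy.stats import pearsonr
+
+
+def pearson_corr(items):
+    preds, targets = zip(*items)
+    corr, _pval = pearsonr(preds, targets)
+    return corr
+```
+
+Your task would then return this function from `aggregation()` (e.g. `{"pearson_corr": pearson_corr}`) and flag it in `higher_is_better()`, alongside whatever per-example values you emit from `process_results`.
+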
+# Running your Task
+
+If you implemented the above successfully, your command should run as follows:
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks yourdataset --device cpu
+```
+
+**If you want to run GAD's BLURB set, try:**
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks gad --device cpu
+```
+
+**If you want to run BIOSSES's BLURB set, try:**
+```
+python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks biosses --device cpu
+```
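+
+Since BIOSSES is a sentence-similarity task, you probably want a Pearson correlation over the predictions rather than accuracy. The hacky `get_pearson_corr_biosses.py` script added earlier in this PR post-processes the latest `examples-*biosses*` file in the outputs folder; it hard-codes the outputs path and dataset name at the top, so adjust those before running:
+
+```
+python get_pearson_corr_biosses.py
+```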