Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions get_pearson_corr_biosses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Hacky script to get pearson's correlation
2022.06.04
"""
import os
from pathlib import Path
from scipy.stats import pearsonr
import json

if __name__ == "__main__":

# path to lm-eval and outputs
dpath = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/lm-evaluation-harness/outputs"
dataset = "biosses"

# Get all files with a dataset
fnames = Path(dpath).glob("*" + dataset + "*")
fnames = [i for i in fnames if "examples-" in i.__str__()]

# Get the latest result
latest_result = sorted(fnames, key=os.path.getmtime)[-1]

with open(latest_result, 'r') as json_file:
json_list = list(json_file)


# Update only the tasks with float-type predictions
tasks = {}
for json_str in json_list:
result = json.loads(json_str)

task = result.get("prompt_name", None)

if result.get("prompt_name", None) is not None:

if task not in tasks:
tasks.update(
{task: {"pred": [], "target": []} }
)

pred = result.get("pred", None)
target = result.get("target", None)

if (pred is not None) and (target is not None):
try:
tasks[task]["pred"].append(float(pred))
tasks[task]["target"].append(float(target))
except ValueError:
pass

# For each task with non-zero pred/targets, compute Pearson-R
lines = []
row = ["Task", "Correlation", "P-value"]
lines.append(",".join(row) + "\n")
for task in tasks:
if len(tasks[task]["pred"]):
corr, pval = pearsonr(tasks[task]["pred"], tasks[task]["target"])
row = [task, str(round(corr, 3)), str(pval)[:5]]
lines.append(" ".join(row).rjust(20, " ") + "\n")
lines.append(",".join(row) + "\n")

with open(dataset + "_results.txt", "w") as f:
f.writelines(lines)
162 changes: 162 additions & 0 deletions instructions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
## Setting up BioPrompting
**2022.06.03**

In order to get a fresh install, I did the following:

### Create a fresh conda environment

I created a new conda environment with the following commands:

```
conda create -n bbprompt python=3.9
conda activate bbprompt
```


### Install the bigbio fork of promptsource

I installed promptsource as such:

```
git clone https://github.com/OpenBioLink/promptsource
cd promptsource
pip install -e .
cd ../
```

You may want to fork your own version and install the remote fork.


### Install LM-Eval

Install specifically the bigbio version of LM-Eval. You can do so as follows:

```
git clone [email protected]:bigscience-workshop/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout bigbio
git pull origin bigbio
pip install -e .
cd ..
```

### Install the most recent BigBio dataloaders dataset

Install the main branch of bigbio:

```
git clone [email protected]:bigscience-workshop/biomedical.git
cd biomedical
pip install -e .
cd ..
```

### Creating a custom prompt

**Make sure that in your promptsource installation, a corresponding template exists!**

For this template to exist, you will find them here: `promptsource/promptsource/templates`. The file itself should be called `templates.yaml` and should be generated via the streamlit app (make sure protobuf <= 3.20.X). The template file should live in a folder with the following structure:

`promptsource/templates/your_dataset_name/your_dataset_name_bigbio_schema`

where `your_dataset_name` and `schema` are replaced with the name of the dataset and the specific config you wish to use.

##### Create a new task

Create a new task with the following format filled in:

Note, you will not get results if your data does not have a validation + test set. A crappy hack is to have `validation_docs` and/or `test_docs` return the train set.

Place a file `yourdataset.py` in `lm-evaluation-harness/lm_eval/tasks` that fills out the criteria below:

```python
from lm_eval.base import BioTask

_CITATION = """
PLACE_YOUR_CITATION_HERE
"""


class YourDatasetBase(BioTask):
VERSION = 0
DATASET_PATH = "path/to/dataloader/script/from/bigbio"
DATASET_NAME = None
SPLIT = None

# Fill these out as T/F depending on your dataset
def has_training_docs(self):
return True

def has_validation_docs(self):
return True

def has_test_docs(self):
return True

def training_docs(self):
if self.has_training_docs():
return self.dataset["train"]

def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]

def test_docs(self):
if self.has_test_docs():
return self.dataset["test"] # you can replace with `train` to hack around


class YourDatasetSplit(YourDatasetBase):
DATASET_NAME = "yourdataset_bigbio_<schema>"
```

Add this dataset task to `lm-evaluation-harness/lm_eval/tasks/__init__.py` by adding the following lines:

```python
from . import yourdataset # Place this in the beginning import

# Within TASK_REGISTRY, add the following command
TASK_REGISTRY = {
...
"your_dataset_name": yourdataset.Class_Corresponding_To_Schema
}
```

(For example, BIOSSES would look as such:)
```python
"biosses": biosses.BiossesPairs
```

### Getting your task to run

In order to get lm-eval to run, I made the following changes to `lm-evaluation-harness/lm_eval/base.py`.

1) [L20](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L720): change `target = self.doc_to_target(doc)` to `target = [self.doc_to_target(doc)]`. This seemed to give me an issue where a str was returned as opposed to a List of str.
<br>
2) [L1055, L1056](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L1055): Edit `CONFIGURED_RANKED_CHOICE_PS_METRICS` or `CONFIGURED_GENERATION_PS_METRICS` to include your custom task metric (found in your `template.yaml` file prompts under `metrics`.)

### Implementing a Custom Metric

In cases where you may need to implement a custom metric, you will need to write a custom function in `lm-evaluation-harness/lm_eval/metrics.py`. More advanced implementations can exist in `lm-evaluation-harness/lm_eval/metrics_impls`.

**NOTE** If you are working with numerical information (I.e. correlation etc) make sure you have answer choices. If your answer choices are `null` in your prompt, you will go into the "generation" part of the code which may not be useful.

Next, ensure your task has an output in `aggregation` in `lm-evaluation-harness/lm_eval/base.py`.
In `lm-evaluation-harness/lm_eval/evaluator.py`, the actual eval code is executed. Changes can be made around 265 to change your logic.

# Running your Task

If you implemented the above successfully, your command should run as follows:
```
python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks yourdataset --device cpu
```

**If you want to run GAD's BLURB set, try:**<br>
```
python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks gad --device cpu
```

**If you want to run BIOSSES's BLURB set, try:**<br>
```
python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks biosses --device cpu
```
4 changes: 2 additions & 2 deletions lm_eval/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,7 @@ def process_results(self, doc, results):
The results of the requests created in construct_requests.
"""
answer_choices_list = self.prompt.get_answer_choices_list(doc)
target = self.doc_to_target(doc)
target = [self.doc_to_target(doc)] # N.SEELAM The target is a str, not a list of strs
if answer_choices_list:
# If answer_choices_list, then this is a ranked choice prompt.
# NOTE: In the future, target could be a list of strings.
Expand Down Expand Up @@ -1052,7 +1052,7 @@ class BioTask(PromptSourceTask):
*and* add additional custom processing, override `process_results`, `higher_is_better`, and `aggregation`.
"""

CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy"])
CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy", "Other"])
CONFIGURED_GENERATION_PS_METRICS = set(["BLEU", "ROUGE", "SARI"])
SPLIT = None

Expand Down
20 changes: 19 additions & 1 deletion lm_eval/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from . import wmt
from . import cnn_dailymail
from . import diabla

from . import gad
from . import biosses

########################################
# All tasks
Expand Down Expand Up @@ -199,6 +200,23 @@

# SciTail
"scitail": scitail.SciTailTE,

# All GAD Datasets
"gad0": gad.GadFold0Text,
"gad1": gad.GadFold1Text,
"gad2": gad.GadFold2Text,
"gad3": gad.GadFold3Text,
"gad4": gad.GadFold4Text,
"gad5": gad.GadFold5Text,
"gad6": gad.GadFold6Text,
"gad7": gad.GadFold7Text,
"gad8": gad.GadFold8Text,
"gad9": gad.GadFold9Text,
"gad": gad.GadBlurbText,

# Biosses
"biosses": biosses.BiossesPairs,

}


Expand Down
50 changes: 50 additions & 0 deletions lm_eval/tasks/biosses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
BIOSSES:
NOTE: A stupid hack I am doing is forcing a "validation" set as this only has training data implemented.
"""
from lm_eval.base import BioTask

_CITATION = """
@article{souganciouglu2017biosses,
title={BIOSSES: a semantic sentence similarity estimation system for the biomedical domain},
author={Soğancıoğlu, Gizem, Hakime Öztürk, and Arzucan Özgür},
journal={Bioinformatics},
volume={33},
number={14},
pages={i49--i58},
year={2017},
publisher={Oxford University Press}
}
"""


class BiossesBase(BioTask):
    """Base lm-eval task for the BIOSSES sentence-similarity dataset.

    Loads the dataset from a local checkout of the bigbio dataloader
    script and reports all three splits as available.
    """

    VERSION = 0
    # NOTE(review): machine-specific absolute path to a local bigbio
    # checkout — only valid on this machine; confirm before reuse.
    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/biosses"
    DATASET_NAME = None
    SPLIT = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if not self.has_training_docs():
            return None
        return self.dataset["train"]

    def validation_docs(self):
        if not self.has_validation_docs():
            return None
        return self.dataset["validation"]

    def test_docs(self):
        if not self.has_test_docs():
            return None
        return self.dataset["test"]


class BiossesPairs(BiossesBase):
    # Selects the bigbio "pairs" schema config of BIOSSES.
    DATASET_NAME = "biosses_bigbio_pairs"
Loading