Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions get_pearson_corr_biosses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Hacky script to get pearson's correlation
2022.06.04
"""
import os
from pathlib import Path
from scipy.stats import pearsonr
import json

if __name__ == "__main__":

# path to lm-eval and outputs
dpath = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/lm-evaluation-harness/outputs"
dataset = "biosses"

# Get all files with a dataset
fnames = Path(dpath).glob("*" + dataset + "*")
fnames = [i for i in fnames if "examples-" in i.__str__()]

# Get the latest result
latest_result = sorted(fnames, key=os.path.getmtime)[-1]

with open(latest_result, 'r') as json_file:
json_list = list(json_file)


# Update only the tasks with float-type predictions
tasks = {}
for json_str in json_list:
result = json.loads(json_str)

task = result.get("prompt_name", None)

if result.get("prompt_name", None) is not None:

if task not in tasks:
tasks.update(
{task: {"pred": [], "target": []} }
)

pred = result.get("pred", None)
target = result.get("target", None)

if (pred is not None) and (target is not None):
try:
tasks[task]["pred"].append(float(pred))
tasks[task]["target"].append(float(target))
except ValueError:
pass

# For each task with non-zero pred/targets, compute Pearson-R
lines = []
row = ["Task", "Correlation", "P-value"]
lines.append(",".join(row) + "\n")
for task in tasks:
if len(tasks[task]["pred"]):
corr, pval = pearsonr(tasks[task]["pred"], tasks[task]["target"])
row = [task, str(round(corr, 3)), str(pval)[:5]]
lines.append(" ".join(row).rjust(20, " ") + "\n")
lines.append(",".join(row) + "\n")

with open(dataset + "_results.txt", "w") as f:
f.writelines(lines)
162 changes: 162 additions & 0 deletions instructions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
## Setting up BioPrompting
**2022.06.03**

In order to get a fresh install, I did the following:

### Create a fresh conda environment

I created a new conda environment with the following commands:

```
conda create -n bbprompt python=3.9
conda activate bbprompt
```


### Install the bigbio fork of promptsource

I installed promptsource as such:

```
git clone https://github.com/OpenBioLink/promptsource
cd promptsource
pip install -e .
cd ../
```

You may want to fork your own version and install the remote fork.


### Install LM-Eval

Install specifically the bigbio version of LM-Eval. You can do so as follows:

```
git clone [email protected]:bigscience-workshop/lm-evaluation-harness.git
cd lm-evaluation-harness
git checkout bigbio
git pull origin bigbio
pip install -e .
cd ..
```

### Install the most recent BigBio dataloaders dataset

Install the main branch of bigbio:

```
git clone [email protected]:bigscience-workshop/biomedical.git
cd biomedical
pip install -e .
cd ..
```

### Creating a custom prompt

**Make sure that in your promptsource installation, a corresponding template exists!**

For this template to exist, you will find them here: `promptsource/promptsource/templates`. The file itself should be called `templates.yaml` and should be generated via the streamlit app (make sure protobuf <= 3.20.X). The template file should live in a folder with the following structure:

`promptsource/templates/your_dataset_name/your_dataset_name_bigbio_schema`

where `your_dataset_name` and `schema` are replaced with the name of the dataset and the specific config you wish to use.

##### Create a new task

Create a new task with the following format filled in:

Note, you will not get results if your data does not have a validation + test set. A crappy hack is to have `validation_docs` and/or `test_docs` return the train set.

Place a file `yourdataset.py` in `lm-evaluation-harness/lm_eval/tasks` that fills out the criteria below:

```python
from lm_eval.base import BioTask

_CITATION = """
PLACE_YOUR_CITATION_HERE
"""


class YourDatasetBase(BioTask):
VERSION = 0
DATASET_PATH = "path/to/dataloader/script/from/bigbio"
DATASET_NAME = None
SPLIT = None

# Fill these out as T/F depending on your dataset
def has_training_docs(self):
return True

def has_validation_docs(self):
return True

def has_test_docs(self):
return True

def training_docs(self):
if self.has_training_docs():
return self.dataset["train"]

def validation_docs(self):
if self.has_validation_docs():
return self.dataset["validation"]

def test_docs(self):
if self.has_test_docs():
return self.dataset["test"] # you can replace with `train` to hack around


class YourDatasetSplit(YourDatasetBase):
DATASET_NAME = "yourdataset_bigbio_<schema>"
```

Add this dataset task to `lm-evaluation-harness/lm_eval/tasks/__init__.py` by adding the following lines:

```python
from . import yourdataset # Place this in the beginning import

# Within TASK_REGISTRY, add the following command
TASK_REGISTRY = {
...
"your_dataset_name": yourdataset.Class_Corresponding_To_Schema
}
```

(For example, BIOSSES would look as such:)
```python
"biosses": biosses.BiossesPairs
```

### Getting your task to run

In order to get lm-eval to run, I made the following changes to `lm-evaluation-harness/lm_eval/base.py`.

1) [L20](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L720): change `target = self.doc_to_target(doc)` to `target = [self.doc_to_target(doc)]`. This seemed to give me an issue where a str was returned as opposed to a List of str.
<br>
2) [L1055, L1056](https://github.com/bigscience-workshop/lm-evaluation-harness/blob/ea1afe62423c4ff75d6579ccc6942ad8b5138298/lm_eval/base.py#L1055): Edit `CONFIGURED_RANKED_CHOICE_PS_METRICS` or `CONFIGURED_GENERATION_PS_METRICS` to include your custom task metric (found in your `template.yaml` file prompts under `metrics`.)

### Implementing a Custom Metric

In cases where you may need to implement a custom metric, you will need to write a custom function in `lm-evaluation-harness/lm_eval/metrics.py`. More advanced implementations can exist in `lm-evaluation-harness/lm_eval/metrics_impls`.

**NOTE** If you are working with numerical information (I.e. correlation etc) make sure you have answer choices. If your answer choices are `null` in your prompt, you will go into the "generation" part of the code which may not be useful.

Next, ensure your task has an output in `aggregation` in `lm-evaluation-harness/lm_eval/base.py`.
In `lm-evaluation-harness/lm_eval/evaluator.py`, the actual eval code is executed. Changes can be made around 265 to change your logic.

# Running your Task

If you implemented the above successfully, your command should run as follows:
```
python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks yourdataset --device cpu
```

**If you want to run GAD's BLURB set, try:**<br>
```
python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks gad --device cpu
```

**If you want to run BIOSSES's BLURB set, try:**<br>
```
python main.py --model hf-seq2seq --model_args pretrained=t5-small --tasks biosses --device cpu
```
4 changes: 2 additions & 2 deletions lm_eval/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,7 @@ def process_results(self, doc, results):
The results of the requests created in construct_requests.
"""
answer_choices_list = self.prompt.get_answer_choices_list(doc)
target = self.doc_to_target(doc)
target = [self.doc_to_target(doc)] # N.SEELAM The target is a str, not a list of strs
if answer_choices_list:
# If answer_choices_list, then this is a ranked choice prompt.
# NOTE: In the future, target could be a list of strings.
Expand Down Expand Up @@ -1052,7 +1052,7 @@ class BioTask(PromptSourceTask):
*and* add additional custom processing, override `process_results`, `higher_is_better`, and `aggregation`.
"""

CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy"])
CONFIGURED_RANKED_CHOICE_PS_METRICS = set(["Accuracy", "Other"])
CONFIGURED_GENERATION_PS_METRICS = set(["BLEU", "ROUGE", "SARI"])
SPLIT = None

Expand Down
20 changes: 19 additions & 1 deletion lm_eval/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from . import wmt
from . import cnn_dailymail
from . import diabla

from . import gad
from . import biosses

########################################
# All tasks
Expand Down Expand Up @@ -199,6 +200,23 @@

# SciTail
"scitail": scitail.SciTailTE,

# All GAD Datasets
"gad0": gad.GadFold0Text,
"gad1": gad.GadFold1Text,
"gad2": gad.GadFold2Text,
"gad3": gad.GadFold3Text,
"gad4": gad.GadFold4Text,
"gad5": gad.GadFold5Text,
"gad6": gad.GadFold6Text,
"gad7": gad.GadFold7Text,
"gad8": gad.GadFold8Text,
"gad9": gad.GadFold9Text,
"gad": gad.GadBlurbText,

# Biosses
"biosses": biosses.BiossesPairs,

}


Expand Down
50 changes: 50 additions & 0 deletions lm_eval/tasks/biosses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
BIOSSES:
NOTE: A stupid hack I am doing is forcing a "validation" set as this only has training data implemented.
"""
from lm_eval.base import BioTask

_CITATION = """
@article{souganciouglu2017biosses,
title={BIOSSES: a semantic sentence similarity estimation system for the biomedical domain},
author={Soğancıoğlu, Gizem, Hakime Öztürk, and Arzucan Özgür},
journal={Bioinformatics},
volume={33},
number={14},
pages={i49--i58},
year={2017},
publisher={Oxford University Press}
}
"""


class BiossesBase(BioTask):
    """Base lm-eval task for the BIOSSES sentence-similarity dataset.

    Loads the dataset from a local checkout of the bigbio dataloader
    script and reports all three splits as available.
    """

    VERSION = 0
    # NOTE(review): machine-specific absolute path to a local bigbio
    # checkout — only valid on this machine; confirm before reuse.
    DATASET_PATH = "/home/natasha/Projects/hfbiomed/full_prompting_pipeline/biomedical/bigbio/biodatasets/biosses"
    DATASET_NAME = None
    SPLIT = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if not self.has_training_docs():
            return None
        return self.dataset["train"]

    def validation_docs(self):
        if not self.has_validation_docs():
            return None
        return self.dataset["validation"]

    def test_docs(self):
        if not self.has_test_docs():
            return None
        return self.dataset["test"]


class BiossesPairs(BiossesBase):
    # Selects the bigbio "pairs" schema config of BIOSSES.
    DATASET_NAME = "biosses_bigbio_pairs"
Loading