Changes from all commits
47 commits
44f16ee
add annotation tool
niansong1996 Oct 10, 2022
bde0474
10 annotations for squall for stephen
niansong1996 Oct 12, 2022
87c8385
new ui enhancements and testing
Nov 2, 2022
b4ebe45
fixed bug where cancel still saved result, fixed bug where exec_info …
StephenYin01 Nov 3, 2022
5451568
fixed annotation missing an annotation
StephenYin01 Nov 3, 2022
653af0b
new annotations
StephenYin01 Nov 10, 2022
63c2ed6
new annotations again
StephenYin01 Nov 10, 2022
93074da
50 squall instances
MartinRiddell Nov 11, 2022
f54d0c9
finished 50 examples
StephenYin01 Nov 16, 2022
56a7664
fix bug in initing spider executor
niansong1996 Nov 23, 2022
a0d22b6
fix bug in executor
niansong1996 Nov 25, 2022
b09c70a
Merge pull request #16 from Yale-LILY/martin/squall_annotation
niansong1996 Nov 25, 2022
5e6b374
Merge pull request #15 from Yale-LILY/stephen/annotation-dev
niansong1996 Nov 25, 2022
141e214
new annotations for spider (need to revise some with new execution); …
StephenYin01 Dec 1, 2022
7657ae2
fixed annotation examples
StephenYin01 Dec 2, 2022
3bfdc9f
Merge pull request #17 from Yale-LILY/stephen/spider-annotation-dev
niansong1996 Dec 2, 2022
834dfd7
new examples; fixed spider execution criterion bug
StephenYin01 Dec 2, 2022
0457619
debug spider_execution_py if table name is python reserved keyword, m…
StephenYin01 Dec 3, 2022
400b98f
Merge pull request #18 from Yale-LILY/stephen/spider-annotation-dev
niansong1996 Dec 7, 2022
1c35e33
first few annotations for spider
MartinRiddell Dec 8, 2022
05f3437
Finished 50 annotations for spider dataset
MartinRiddell Feb 8, 2023
4aee7bd
100 annotations by chatgpt on successful and unsuccessful spider data…
MartinRiddell Feb 15, 2023
2780b01
100 annotations by chatgpt on unsuccessful spider dataset
MartinRiddell Feb 15, 2023
faeef72
added dataset of codex's failures on spider
MartinRiddell Feb 15, 2023
c912865
creating a new branch for this notebook
MartinRiddell Feb 23, 2023
6702aff
playing with result processing
MartinRiddell Mar 2, 2023
c3fe0d3
cleaned up a bit
MartinRiddell Mar 28, 2023
a98e6f8
implemented rudimentary human evaluation for model's results on squa…
MartinRiddell Apr 13, 2023
5ca8911
added doc string and a few comments in the code
MartinRiddell Apr 13, 2023
e10f6ab
start_eval can now handle datasets with the same keys as the gsmath d…
MartinRiddell Apr 14, 2023
cbecc01
commented out print statement, and added a check for ERROR messages i…
MartinRiddell Apr 14, 2023
cad6fff
better error handling
MartinRiddell Apr 15, 2023
df59973
added 'big difference' and 'unclear instructions' to the reasons for …
MartinRiddell Apr 15, 2023
29c7552
better error handling. Forgot to add this in previous commit
MartinRiddell Apr 15, 2023
7576f80
began human evaluation of codex on gsmath dataset
MartinRiddell Apr 15, 2023
5270c76
added an extra error to human eval files, and finished evaluating 100…
MartinRiddell Apr 24, 2023
ac8a222
small changes to evaluation script, and eval_report can be used to su…
MartinRiddell May 11, 2023
2a76777
finished evaluating 100 mbpp and gsmath, 50 wtq problems answered by …
MartinRiddell May 11, 2023
e82a2fc
added rekey script. It's not great, but it gives the indices of the q…
MartinRiddell May 12, 2023
17b4556
stephen codex cushman on gsm8k evals
StephenYin01 May 14, 2023
d924d73
finished gpt4 evaluation on GSM8k
StephenYin01 May 16, 2023
577d275
finished human eval of starcoder on gsm8k
StephenYin01 May 16, 2023
8970746
added a few evaluations that are done. Spider_gpt4 isn't done, but pu…
MartinRiddell May 21, 2023
77cc573
resolving merge
MartinRiddell May 21, 2023
6870fcb
finish spider starcoder evals (2 shot)
StephenYin01 May 23, 2023
17a28c1
finished codex cushman eval on spider
StephenYin01 May 24, 2023
afcc894
added evals for gpt4 and davinci models on spider dataset
MartinRiddell May 24, 2023
39,672 changes: 39,672 additions & 0 deletions Result_Processing.ipynb

Large diffs are not rendered by default.

Empty file added annotation/__init__.py
169 changes: 169 additions & 0 deletions annotation/annotation_tasks.py
@@ -0,0 +1,169 @@
import json
import os
import random
import sqlite3
import time

import pandas as pd

from overrides import overrides
from typing import List, Dict, Any, Tuple

from execution.executors import WTQPythonExecutor, SpiderPythonExecutor
from execution.spider_execution import spider_execution_pd_sql, pd_df_to_dict, spider_execution_py, db_to_df_dict

class AnnotationTask:
    def __init__(self, data_file, output_file):
        self.data_file = data_file
        self.output_file = output_file

        # read the examples to annotate
        self.examples = []
        self.read_data()
        self.postprocess_data()

        # create the output file if it does not exist
        if not os.path.isfile(output_file):
            self.annotation_indices = self.get_annotation_indices()
            print(f"Creating new output file {output_file} for {len(self.annotation_indices)} examples")
            with open(output_file, "w+") as f:
                annotation_metadata = {"data_file": self.data_file,
                                       "total_num_examples": len(self.examples),
                                       "annotation_indices": self.annotation_indices}
                f.write(json.dumps(annotation_metadata) + "\n")

            # keep track of the progress
            self.annotated_examples = []
        else:
            # recover the metadata and the annotated examples
            self.recovery_progress(output_file)

    def postprocess_data(self):
        return

    def read_data(self):
        with open(self.data_file, "r") as f:
            self.examples = [json.loads(s) for s in f.readlines()]

    def get_annotation_indices(self) -> List[int]:
        return list(range(len(self.examples)))

    def recovery_progress(self, output_file: str):
        # first recover the annotation progress from the file
        with open(output_file, "r") as f:
            lines = f.readlines()
            annotation_metadata = json.loads(lines[0])
            self.annotated_examples = [json.loads(s) for s in lines[1:]]
            self.annotation_indices = annotation_metadata["annotation_indices"]

        # then verify the annotated examples to match the data file
        for i, example in enumerate(self.annotated_examples):
            assert example["metadata"] == self.examples[self.annotation_indices[i]], \
                f"Annotated example does not match the data file"

        print(f"Recovered progress from {output_file} for {len(self.annotated_examples)} out of {len(self.annotation_indices)} total examples")
        time.sleep(2)

    def save_single_annotation(self, example: Dict[str, Any], annotation: str, exec_result: Any = None):
        save_example = {"metadata": example, "annotation": annotation, "exec_result": exec_result}

        # save both to the output file and the annotated examples
        self.annotated_examples.append(save_example)
        with open(self.output_file, "a") as f:
            f.write(json.dumps(save_example) + "\n")

    def get_and_display_next_example(self):
        next_example_idx = self.annotation_indices[len(self.annotated_examples)]
        print("\033[1;7;34m" + '#' * 20 + f" Example {next_example_idx} " + '#' * 20 + "\033[0m")
        self.display_example(self.examples[next_example_idx])
        print("\033[1;7;34m" + '#' * 40 + "\033[0m")

        return self.examples[next_example_idx]

    def display_example(self, example: Dict[str, Any]):
        raise NotImplementedError("Please implement this method in the subclass")

    def check_annotation_correctness(self, example: Dict[str, Any], annotation: str) -> Tuple[bool, str]:
        raise NotImplementedError("Please implement this method in the subclass")

    def get_annotation_instructions(self, example: Dict[str, Any]) -> str:
        return "Enter annotation (or `exit`/`skip`): "

class SQL2PandasAnnotationTask(AnnotationTask):
    def __init__(self, dataset_name: str, annotation_size: int=100):
        # init the parameters
        assert dataset_name in ["spider", "squall"], f"Invalid dataset name {dataset_name}"
        self.dataset_name = dataset_name
        self.annotation_size = annotation_size
        self.executor = WTQPythonExecutor() if dataset_name == "squall" else SpiderPythonExecutor()

        data_file_name = "data/spider/train_spider_processed_v2.jsonl" if dataset_name == "spider" \
            else "data/squall/squall_processed_train_all.jsonl"
        output_file_name = f"{os.getlogin()}_{dataset_name}_annotation.jsonl"

        # init the base class
        super().__init__(data_file_name, output_file_name)

    @overrides
    def postprocess_data(self):
        # add db_path to the examples
        for example in self.examples:
            if self.dataset_name == "spider":
                example["db_path"] = os.path.join("data/spider/database", example["db_id"], f'{example["db_id"]}.sqlite')
            elif self.dataset_name == "squall":
                example["db_path"] = os.path.join("data/squall/tables/db", example["db_id"] + ".db")
            else:
                raise ValueError(f"Unknown dataset name {self.dataset_name}")

    @overrides
    def get_annotation_indices(self) -> List[int]:
        all_indices = list(range(len(self.examples)))
        random.shuffle(all_indices)

        return all_indices[:self.annotation_size]

    def display_database(self, db_path: str):
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        df_dict = {}
        for table_name in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall():
            df_dict[table_name[0]] = pd.read_sql_query(f"SELECT * FROM {table_name[0]}", conn)

        for table_name, df in df_dict.items():
            print('-' * 50)
            print(f"Table: {table_name}, Shape: {df.shape[0]} rows, {df.shape[1]} columns")
            df_to_print = df.head(5)
            print(df_to_print)
        print('-' * 50)
        print("NOTE: Only the first 5 rows are shown!!!")

    @overrides
    def display_example(self, example: Dict[str, Any]) -> str:
        print("Database:")
        self.display_database(example['db_path'])
        print(f"Question: {example['question']}")
        print(f"SQL Query: {example['query']}")

    @overrides
    def check_annotation_correctness(self, example: Dict[str, Any], annotation: str) -> Tuple[bool, str]:
        annotated_sql = annotation.strip()
        exec_match, exec_result = self.executor.exec_program(annotated_sql, example)
        if exec_match == 1:
            return True, f"{exec_result}"
        else:
            expected_answer = example['answer'] if self.dataset_name == "spider" else example['original_answer']
            return False, f"Expected: {expected_answer} but got {exec_result}"

    @overrides
    def get_annotation_instructions(self, example: Dict[str, Any]) -> str:
        basic_prompt = "Enter annotation (or `exit`/`skip`), for python use `;` to separate lines:\n"

        # construct the example specific prompt
        conn = sqlite3.connect(example["db_path"])
        df_dict = db_to_df_dict(conn)
        table_vars_code = "import pandas as pd\n"
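        # One line per table: a comment listing the table's columns (filtered out of the
        # final prompt below) plus an assignment binding the table name to its DataFrame.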
        for table_name in df_dict.keys():
            table_vars_code += f"# {' '.join(list(df_dict[table_name].columns))}\n{table_name} = df_dict['{table_name}']\n"
        example_prompt = "; ".join(list(filter(lambda x: not x.startswith("#"), table_vars_code.split("\n"))))

        return basic_prompt + example_prompt

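For context, a finished annotation for this task is a small pandas program typed at the prompt, with `;` separating statements and each table exposed as a DataFrame variable named after the table. The sketch below is illustrative only: the table name, the question, and the convention of storing the result in a variable named `answer` are assumptions, not taken from this diff.

```python
# Hypothetical Spider-style example, for illustration only.
# Question: "How many singers are older than 18?"   SQL: SELECT count(*) FROM singer WHERE age > 18
single_statement_annotation = "answer = len(singer[singer['age'] > 18])"

# A multi-statement annotation is typed on one line with `;` as the separator:
multi_statement_annotation = "adults = singer[singer['age'] > 18]; answer = len(adults)"
```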
50 changes: 50 additions & 0 deletions annotation/start_annotate.py
@@ -0,0 +1,50 @@
import os
import time

from annotation.annotation_tasks import SQL2PandasAnnotationTask

ANNOTAION_TASKS = ["spider", "squall"]

def main():
input("Press Enter to start annotation...")
task_name = input("Select the dataset to annotate (spider/squall): ")
task = SQL2PandasAnnotationTask(dataset_name=task_name)

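    # `same_example` keeps the current example on screen after an incorrect or cancelled
    # annotation; `last_annotation` is what gets saved if the annotator types `override`.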
    same_example = False
    last_annotation = None
    while True:
        if not same_example:
            example = task.get_and_display_next_example()
        annotation = input(task.get_annotation_instructions(example))
        if annotation == "exit":
            break
        elif annotation == "skip":
            task.save_single_annotation(example, annotation)
            same_example = False
        elif annotation == "override" and last_annotation is not None:
            exec_match, exec_info = task.check_annotation_correctness(example, last_annotation)
            task.save_single_annotation(example, last_annotation, exec_result=exec_info)
            same_example = False
        else:
            exec_match, exec_info = task.check_annotation_correctness(example, annotation)
            last_annotation = annotation
            if exec_match:
                same_example = False
                print("\033[1m" + " RESULT... " + "\033[0m")
                print("\033[42m" + "Correct! Returned: " + exec_info + "\033[0m")
                save = input("Press \033[33mENTER\033[0m to save this annotation, or type `\033[33mcancel\033[0m` to improve this annotation: ")
                print("_________________________________________________________________________________________________________")
                if save == "cancel":
                    same_example = True
                else:
                    task.save_single_annotation(example, annotation, exec_result=exec_info)
            else:
                print("\033[1m" + " RESULT... " + "\033[0m")
                print("\033[41m" + exec_info + "\033[0m")
                print("Annotation is not correct. More information: (if you believe the annotation is correct, enter `\033[33moverride\033[0m`)")
                print("_________________________________________________________________________________________________________")
                same_example = True


if __name__ == '__main__':
    main()
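The output file produced by an annotation session is JSON Lines, as written by `AnnotationTask.save_single_annotation` above: the first line is the metadata header and every later line is one annotation record. A minimal reader sketch; the annotator name in the file name is made up:

```python
import json

with open("alice_spider_annotation.jsonl") as f:    # hypothetical {user}_{dataset}_annotation.jsonl file
    header = json.loads(f.readline())               # {"data_file", "total_num_examples", "annotation_indices"}
    for line in f:
        record = json.loads(line)                   # {"metadata", "annotation", "exec_result"}
        print(record["annotation"], "->", record["exec_result"])
```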
65 changes: 65 additions & 0 deletions eval_report.py
@@ -0,0 +1,65 @@
import argparse
import json


parser = argparse.ArgumentParser()
parser.add_argument("dataset", type=str, help="The dataset to evaluate on.")
args = parser.parse_args()

dataset = args.dataset

print("Starting report on dataset: {}".format(dataset))

failed_options = ['missing', 'extra', 'subtle', 'unclear']
successfull_options = ['spurious', 'same', 'different']
error_options = ['ERROR: program failed to execute', 'ERROR: no answer variable']

failed_dict = {}
success_dict = {}
error_dict = {}

for key in failed_options:
    failed_dict[key] = []

for key in successfull_options:
    success_dict[key] = []

for key in error_options:
    error_dict[key] = []

with open(dataset, 'r') as f:
    examples = [json.loads(s) for s in f.readlines()]

for example in examples[1:]: # the first example is the header
    evaluation = example['evaluation']
    if evaluation in failed_options:
        failed_dict[evaluation].append(example)

    elif evaluation in successfull_options:
        success_dict[evaluation].append(example)

    elif evaluation in error_options:
        error_dict[evaluation].append(example)


print("\033[1;31mNumber of failed examples: \033[0m")
failed = 0
for key in failed_dict:
    failed += len(failed_dict[key])
    print("\t{}: {}".format(key, len(failed_dict[key])))
print("Total Failed: {}".format(failed))

print("\033[1;32mNumber of successfull examples: \033[0m")
success = 0
for key in success_dict:
    success += len(success_dict[key])
    print("\t{}: {}".format(key, len(success_dict[key])))
print("Total Success: {}".format(success))


print("\033[1;33mNumber of error examples: \033[0m")
error = 0
for key in error_dict:
    error += len(error_dict[key])
# currently all errors are being saved under one header
print("Total Errors: {}".format(error))
57 changes: 53 additions & 4 deletions execution/executors.py
@@ -1,16 +1,18 @@
 import os
 import time
 import ast
+import sqlite3

 from overrides import overrides
 from typing import List, Any, Tuple, Dict, Set, Union
 from concurrent.futures import ProcessPoolExecutor as Pool
 from execution.program_tracing import assertion_to_test

 from execution.spider_official_exec_match import eval_exec_match
-from execution.spider_execution import spider_execution_pd_sql, pd_df_to_dict
+from execution.spider_execution import spider_execution_pd_sql, pd_df_to_dict, spider_execution_py, db_to_df_dict, spider_answer_eq
 from execution.safe_execution_util import execute
 from execution.program_tracing import get_function_final_state
+from execution.wtq_eval import wtq_execution_sql, wtq_answer_eq

 """
 From the models' perspective, the model would only want two things:
@@ -161,9 +163,7 @@ def process_output(self, output: str, tokenizer_eos_token: str) -> str:

     @staticmethod
     def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
-        db_path = os.path.join("data/spider/database", example["db_id"], f'{example["db_id"]}.sqlite')
-
-        raw_exec_match_result = eval_exec_match(db_path, db_path, program,
+        raw_exec_match_result = eval_exec_match(example['db_path'], example['db_path'], program,
                                                 example["query"], plug_value=False,
                                                 keep_distinct=False,
                                                 progress_bar_for_each_datapoint=False)
@@ -181,6 +181,55 @@ def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union

         return exec_match_result, exec_result_store

+class WTQExecutor(SpiderExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @staticmethod
+    def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        exec_results = wtq_execution_sql(program, example)
+
+        if exec_results is None:
+            return -1, "ERROR"
+        else:
+            exec_match_result = int(wtq_answer_eq(exec_results, example["original_answer"]))
+            return exec_match_result, exec_results
+
+class SpiderPythonExecutor(SpiderExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @staticmethod
+    def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        # get the table name -> dataframe dict
+        conn = sqlite3.connect(example["db_path"])
+        df_dicts = db_to_df_dict(conn)
+
+        # execute the program
+        exec_result = spider_execution_py(program, df_dicts)
+        if "order" in example["query_toks_no_value"]:
+            exec_match_result = int(spider_answer_eq(exec_result, example["answer"]))
+        else:
+            exec_match_result = int(spider_answer_eq(exec_result, example["answer"], True))
+
+        return exec_match_result, exec_result
+
+class WTQPythonExecutor(SpiderExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @staticmethod
+    def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        # get the table name -> dataframe dict
+        conn = sqlite3.connect(example["db_path"])
+        df_dicts = db_to_df_dict(conn)
+
+        # execute the program
+        exec_result = spider_execution_py(program, df_dicts)
+        exec_match_result = int(wtq_answer_eq([[exec_result]], example["original_answer"]))
+
+        return exec_match_result, exec_result
+
 class MBPPExecutor(BaseExecutor):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
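For orientation, here is a minimal sketch of how the annotation task above drives one of the new Python executors. Everything in the example dict is hypothetical and only mirrors fields the new code reads (`db_path`, `query`, `query_toks_no_value`, `answer`); the assumption that the pandas program leaves its result in a variable named `answer` comes from the error strings in eval_report.py, not from this diff.

```python
from execution.executors import SpiderPythonExecutor

executor = SpiderPythonExecutor()
example = {
    "db_path": "data/spider/database/concert_singer/concert_singer.sqlite",  # assumed path layout
    "query": "SELECT count(*) FROM singer",
    "query_toks_no_value": ["select", "count", "(", "*", ")", "from", "singer"],
    "answer": [[6]],  # assumed gold-answer format
}
exec_match, exec_result = executor.exec_program("answer = len(singer)", example)
print(exec_match, exec_result)  # 1 when the pandas program's result matches the gold answer
```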