Changes from all commits
47 commits
44f16ee
add annotation tool
niansong1996 Oct 10, 2022
bde0474
10 annotations for squall for stephen
niansong1996 Oct 12, 2022
87c8385
new ui enhancements and testing
Nov 2, 2022
b4ebe45
fixed bug where cancel still saved result, fixed bug where exec_info …
StephenYin01 Nov 3, 2022
5451568
fixed annotation missing an annotation
StephenYin01 Nov 3, 2022
653af0b
new annotations
StephenYin01 Nov 10, 2022
63c2ed6
new annotations again
StephenYin01 Nov 10, 2022
93074da
50 squall instances
MartinRiddell Nov 11, 2022
f54d0c9
finished 50 examples
StephenYin01 Nov 16, 2022
56a7664
fix bug in initing spider executor
niansong1996 Nov 23, 2022
a0d22b6
fix bug in executor
niansong1996 Nov 25, 2022
b09c70a
Merge pull request #16 from Yale-LILY/martin/squall_annotation
niansong1996 Nov 25, 2022
5e6b374
Merge pull request #15 from Yale-LILY/stephen/annotation-dev
niansong1996 Nov 25, 2022
141e214
new annotations for spider (need to revise some with new execution); …
StephenYin01 Dec 1, 2022
7657ae2
fixed annotation examples
StephenYin01 Dec 2, 2022
3bfdc9f
Merge pull request #17 from Yale-LILY/stephen/spider-annotation-dev
niansong1996 Dec 2, 2022
834dfd7
new examples; fixed spider execution criterion bug
StephenYin01 Dec 2, 2022
0457619
debug spider_execution_py if table name is python reserved keyword, m…
StephenYin01 Dec 3, 2022
400b98f
Merge pull request #18 from Yale-LILY/stephen/spider-annotation-dev
niansong1996 Dec 7, 2022
1c35e33
first few annotations for spider
MartinRiddell Dec 8, 2022
05f3437
Finished 50 annotations for spider dataset
MartinRiddell Feb 8, 2023
4aee7bd
100 annotations by chatgpt on successful and unsuccessful spider data…
MartinRiddell Feb 15, 2023
2780b01
100 annotations by chatgpt on unsuccessful spider dataset
MartinRiddell Feb 15, 2023
faeef72
added dataset of codex's failures on spider
MartinRiddell Feb 15, 2023
c912865
creating a new branch for this notebook
MartinRiddell Feb 23, 2023
6702aff
playing with result processing
MartinRiddell Mar 2, 2023
c3fe0d3
cleaned up a bit
MartinRiddell Mar 28, 2023
a98e6f8
implemented rudimentary human evaluation for model's results on squa…
MartinRiddell Apr 13, 2023
5ca8911
added doc string and a few comments in the code
MartinRiddell Apr 13, 2023
e10f6ab
start_eval can now handle datasets with the same keys as the gsmath d…
MartinRiddell Apr 14, 2023
cbecc01
commented out print statement, and added a check for ERROR messages i…
MartinRiddell Apr 14, 2023
cad6fff
better error handling
MartinRiddell Apr 15, 2023
df59973
added 'big difference' and 'unclear instructions' to the reasons for …
MartinRiddell Apr 15, 2023
29c7552
better error handling. Forgot to add this in previous commit
MartinRiddell Apr 15, 2023
7576f80
began human evaluation of codex on gsmath dataset
MartinRiddell Apr 15, 2023
5270c76
added an extra error to human eval files, and finished evaluating 100…
MartinRiddell Apr 24, 2023
ac8a222
small changes to evaluation script, and eval_report can be used to su…
MartinRiddell May 11, 2023
2a76777
finished evaluating 100 mbpp and gsmath, 50 wtq problems answered by …
MartinRiddell May 11, 2023
e82a2fc
added rekey script. It's not great, but it gives the indices of the q…
MartinRiddell May 12, 2023
17b4556
stephen codex cushman on gsm8k evals
StephenYin01 May 14, 2023
d924d73
finished gpt4 evaluation on GSM8k
StephenYin01 May 16, 2023
577d275
finished human eval of starcoder on gsm8k
StephenYin01 May 16, 2023
8970746
added a few evaluations that are done. Spider_gpt4 isn't done, but pu…
MartinRiddell May 21, 2023
77cc573
resolving merge
MartinRiddell May 21, 2023
6870fcb
finish spider starcoder evals (2 shot)
StephenYin01 May 23, 2023
17a28c1
finished codex cushman eval on spider
StephenYin01 May 24, 2023
afcc894
added evals for gpt4 and davinci models on spider dataset
MartinRiddell May 24, 2023
39,672 changes: 39,672 additions & 0 deletions Result_Processing.ipynb

Large diffs are not rendered by default.

Empty file added annotation/__init__.py
169 changes: 169 additions & 0 deletions annotation/annotation_tasks.py
@@ -0,0 +1,169 @@
import json
import os
import random
import sqlite3
import time

import pandas as pd

from overrides import overrides
from typing import List, Dict, Any, Tuple

from execution.executors import WTQPythonExecutor, SpiderPythonExecutor
from execution.spider_execution import spider_execution_pd_sql, pd_df_to_dict, spider_execution_py, db_to_df_dict

class AnnotationTask:
    def __init__(self, data_file, output_file):
        self.data_file = data_file
        self.output_file = output_file

        # read the examples to annotate
        self.examples = []
        self.read_data()
        self.postprocess_data()

        # create the output file if it does not exist
        if not os.path.isfile(output_file):
            self.annotation_indices = self.get_annotation_indices()
            print(f"Creating new output file {output_file} for {len(self.annotation_indices)} examples")
            with open(output_file, "w+") as f:
                annotation_metadata = {"data_file": self.data_file,
                                       "total_num_examples": len(self.examples),
                                       "annotation_indices": self.annotation_indices}
                f.write(json.dumps(annotation_metadata) + "\n")

            # keep track of the progress
            self.annotated_examples = []
        else:
            # recover the metadata and the annotated examples
            self.recovery_progress(output_file)

    def postprocess_data(self):
        return

    def read_data(self):
        with open(self.data_file, "r") as f:
            self.examples = [json.loads(s) for s in f.readlines()]

    def get_annotation_indices(self) -> List[int]:
        return list(range(len(self.examples)))

    def recovery_progress(self, output_file: str):
        # first recover the annotation progress from the file
        with open(output_file, "r") as f:
            lines = f.readlines()
            annotation_metadata = json.loads(lines[0])
            self.annotated_examples = [json.loads(s) for s in lines[1:]]
            self.annotation_indices = annotation_metadata["annotation_indices"]

        # then verify the annotated examples to match the data file
        for i, example in enumerate(self.annotated_examples):
            assert example["metadata"] == self.examples[self.annotation_indices[i]], \
                f"Annotated example does not match the data file"

        print(f"Recovered progress from {output_file} for {len(self.annotated_examples)} out of {len(self.annotation_indices)} total examples")
        time.sleep(2)

    def save_single_annotation(self, example: Dict[str, Any], annotation: str, exec_result: Any = None):
        save_example = {"metadata": example, "annotation": annotation, "exec_result": exec_result}

        # save both to the output file and the annotated examples
        self.annotated_examples.append(save_example)
        with open(self.output_file, "a") as f:
            f.write(json.dumps(save_example) + "\n")

    def get_and_display_next_example(self):
        next_example_idx = self.annotation_indices[len(self.annotated_examples)]
        print("\033[1;7;34m" + '#' * 20 + f" Example {next_example_idx} " + '#' * 20 + "\033[0m")
        self.display_example(self.examples[next_example_idx])
        print("\033[1;7;34m" + '#' * 40 + "\033[0m")

        return self.examples[next_example_idx]

    def display_example(self, example: Dict[str, Any]):
        raise NotImplementedError("Please implement this method in the subclass")

    def check_annotation_correctness(self, example: Dict[str, Any], annotation: str) -> Tuple[bool, str]:
        raise NotImplementedError("Please implement this method in the subclass")

    def get_annotation_instructions(self, example: Dict[str, Any]) -> str:
        return "Enter annotation (or `exit`/`skip`): "

class SQL2PandasAnnotationTask(AnnotationTask):
    def __init__(self, dataset_name: str, annotation_size: int=100):
        # init the parameters
        assert dataset_name in ["spider", "squall"], f"Invalid dataset name {dataset_name}"
        self.dataset_name = dataset_name
        self.annotation_size = annotation_size
        self.executor = WTQPythonExecutor() if dataset_name == "squall" else SpiderPythonExecutor()

        data_file_name = "data/spider/train_spider_processed_v2.jsonl" if dataset_name == "spider" \
            else "data/squall/squall_processed_train_all.jsonl"
        output_file_name = f"{os.getlogin()}_{dataset_name}_annotation.jsonl"

        # init the base class
        super().__init__(data_file_name, output_file_name)

    @overrides
    def postprocess_data(self):
        # add db_path to the examples
        for example in self.examples:
            if self.dataset_name == "spider":
                example["db_path"] = os.path.join("data/spider/database", example["db_id"], f'{example["db_id"]}.sqlite')
            elif self.dataset_name == "squall":
                example["db_path"] = os.path.join("data/squall/tables/db", example["db_id"] + ".db")
            else:
                raise ValueError(f"Unknown dataset name {self.dataset_name}")

    @overrides
    def get_annotation_indices(self) -> List[int]:
        all_indices = list(range(len(self.examples)))
        random.shuffle(all_indices)

        return all_indices[:self.annotation_size]

    def display_database(self, db_path: str):
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        df_dict = {}
        for table_name in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall():
            df_dict[table_name[0]] = pd.read_sql_query(f"SELECT * FROM {table_name[0]}", conn)

        for table_name, df in df_dict.items():
            print('-' * 50)
            print(f"Table: {table_name}, Shape: {df.shape[0]} rows, {df.shape[1]} columns")
            df_to_print = df.head(5)
            print(df_to_print)
        print('-' * 50)
        print("NOTE: Only the first 5 rows are shown!!!")

    @overrides
    def display_example(self, example: Dict[str, Any]) -> str:
        print("Database:")
        self.display_database(example['db_path'])
        print(f"Question: {example['question']}")
        print(f"SQL Query: {example['query']}")

    @overrides
    def check_annotation_correctness(self, example: Dict[str, Any], annotation: str) -> Tuple[bool, str]:
        annotated_sql = annotation.strip()
        exec_match, exec_result = self.executor.exec_program(annotated_sql, example)
        if exec_match == 1:
            return True, f"{exec_result}"
        else:
            expected_answer = example['answer'] if self.dataset_name == "spider" else example['original_answer']
            return False, f"Expected: {expected_answer} but got {exec_result}"

    @overrides
    def get_annotation_instructions(self, example: Dict[str, Any]) -> str:
        basic_prompt = "Enter annotation (or `exit`/`skip`), for python use `;` to separate lines:\n"

        # construct the example specific prompt
        conn = sqlite3.connect(example["db_path"])
        df_dict = db_to_df_dict(conn)
        table_vars_code = "import pandas as pd\n"
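        # One line per table: a comment listing the table's columns (filtered out of the
        # final prompt below) plus an assignment binding the table name to its DataFrame.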
        for table_name in df_dict.keys():
            table_vars_code += f"# {' '.join(list(df_dict[table_name].columns))}\n{table_name} = df_dict['{table_name}']\n"
        example_prompt = "; ".join(list(filter(lambda x: not x.startswith("#"), table_vars_code.split("\n"))))

        return basic_prompt + example_prompt

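For context, a finished annotation for this task is a small pandas program typed at the prompt, with `;` separating statements and each table exposed as a DataFrame variable named after the table. The sketch below is illustrative only: the table name, the question, and the convention of storing the result in a variable named `answer` are assumptions, not taken from this diff.

```python
# Hypothetical Spider-style example, for illustration only.
# Question: "How many singers are older than 18?"   SQL: SELECT count(*) FROM singer WHERE age > 18
single_statement_annotation = "answer = len(singer[singer['age'] > 18])"

# A multi-statement annotation is typed on one line with `;` as the separator:
multi_statement_annotation = "adults = singer[singer['age'] > 18]; answer = len(adults)"
```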
50 changes: 50 additions & 0 deletions annotation/start_annotate.py
@@ -0,0 +1,50 @@
import os
import time

from annotation.annotation_tasks import SQL2PandasAnnotationTask

ANNOTAION_TASKS = ["spider", "squall"]

def main():
input("Press Enter to start annotation...")
task_name = input("Select the dataset to annotate (spider/squall): ")
task = SQL2PandasAnnotationTask(dataset_name=task_name)

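    # `same_example` keeps the current example on screen after an incorrect or cancelled
    # annotation; `last_annotation` is what gets saved if the annotator types `override`.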
    same_example = False
    last_annotation = None
    while True:
        if not same_example:
            example = task.get_and_display_next_example()
        annotation = input(task.get_annotation_instructions(example))
        if annotation == "exit":
            break
        elif annotation == "skip":
            task.save_single_annotation(example, annotation)
            same_example = False
        elif annotation == "override" and last_annotation is not None:
            exec_match, exec_info = task.check_annotation_correctness(example, last_annotation)
            task.save_single_annotation(example, last_annotation, exec_result=exec_info)
            same_example = False
        else:
            exec_match, exec_info = task.check_annotation_correctness(example, annotation)
            last_annotation = annotation
            if exec_match:
                same_example = False
                print("\033[1m" + " RESULT... " + "\033[0m")
                print("\033[42m" + "Correct! Returned: " + exec_info + "\033[0m")
                save = input("Press \033[33mENTER\033[0m to save this annotation, or type `\033[33mcancel\033[0m` to improve this annotation: ")
                print("_________________________________________________________________________________________________________")
                if save == "cancel":
                    same_example = True
                else:
                    task.save_single_annotation(example, annotation, exec_result=exec_info)
            else:
                print("\033[1m" + " RESULT... " + "\033[0m")
                print("\033[41m" + exec_info + "\033[0m")
                print("Annotation is not correct. More information: (if you believe the annotation is correct, enter `\033[33moverride\033[0m`)")
                print("_________________________________________________________________________________________________________")
                same_example = True


if __name__ == '__main__':
    main()
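The output file produced by an annotation session is JSON Lines, as written by `AnnotationTask.save_single_annotation` above: the first line is the metadata header and every later line is one annotation record. A minimal reader sketch; the annotator name in the file name is made up:

```python
import json

with open("alice_spider_annotation.jsonl") as f:    # hypothetical {user}_{dataset}_annotation.jsonl file
    header = json.loads(f.readline())               # {"data_file", "total_num_examples", "annotation_indices"}
    for line in f:
        record = json.loads(line)                   # {"metadata", "annotation", "exec_result"}
        print(record["annotation"], "->", record["exec_result"])
```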
65 changes: 65 additions & 0 deletions eval_report.py
@@ -0,0 +1,65 @@
import argparse
import json


parser = argparse.ArgumentParser()
parser.add_argument("dataset", type=str, help="The dataset to evaluate on.")
args = parser.parse_args()

dataset = args.dataset

print("Starting report on dataset: {}".format(dataset))

failed_options = ['missing', 'extra', 'subtle', 'unclear']
successfull_options = ['spurious', 'same', 'different']
error_options = ['ERROR: program failed to execute', 'ERROR: no answer variable']

failed_dict = {}
success_dict = {}
error_dict = {}

for key in failed_options:
    failed_dict[key] = []

for key in successfull_options:
    success_dict[key] = []

for key in error_options:
    error_dict[key] = []

with open(dataset, 'r') as f:
    examples = [json.loads(s) for s in f.readlines()]

for example in examples[1:]: # the first example is the header
    evaluation = example['evaluation']
    if evaluation in failed_options:
        failed_dict[evaluation].append(example)

    elif evaluation in successfull_options:
        success_dict[evaluation].append(example)

    elif evaluation in error_options:
        error_dict[evaluation].append(example)


print("\033[1;31mNumber of failed examples: \033[0m")
failed = 0
for key in failed_dict:
    failed += len(failed_dict[key])
    print("\t{}: {}".format(key, len(failed_dict[key])))
print("Total Failed: {}".format(failed))

print("\033[1;32mNumber of successfull examples: \033[0m")
success = 0
for key in success_dict:
    success += len(success_dict[key])
    print("\t{}: {}".format(key, len(success_dict[key])))
print("Total Success: {}".format(success))


print("\033[1;33mNumber of error examples: \033[0m")
error = 0
for key in error_dict:
    error += len(error_dict[key])
# currently all errors are being saved under one header
print("Total Errors: {}".format(error))
57 changes: 53 additions & 4 deletions execution/executors.py
@@ -1,16 +1,18 @@
 import os
 import time
 import ast
+import sqlite3

 from overrides import overrides
 from typing import List, Any, Tuple, Dict, Set, Union
 from concurrent.futures import ProcessPoolExecutor as Pool
 from execution.program_tracing import assertion_to_test

 from execution.spider_official_exec_match import eval_exec_match
-from execution.spider_execution import spider_execution_pd_sql, pd_df_to_dict
+from execution.spider_execution import spider_execution_pd_sql, pd_df_to_dict, spider_execution_py, db_to_df_dict, spider_answer_eq
 from execution.safe_execution_util import execute
 from execution.program_tracing import get_function_final_state
+from execution.wtq_eval import wtq_execution_sql, wtq_answer_eq

 """
 From the models' perspective, the model would only want two things:
@@ -161,9 +163,7 @@ def process_output(self, output: str, tokenizer_eos_token: str) -> str:

     @staticmethod
     def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
-        db_path = os.path.join("data/spider/database", example["db_id"], f'{example["db_id"]}.sqlite')
-
-        raw_exec_match_result = eval_exec_match(db_path, db_path, program,
+        raw_exec_match_result = eval_exec_match(example['db_path'], example['db_path'], program,
                                                 example["query"], plug_value=False,
                                                 keep_distinct=False,
                                                 progress_bar_for_each_datapoint=False)
@@ -181,6 +181,55 @@ def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union

         return exec_match_result, exec_result_store

+class WTQExecutor(SpiderExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @staticmethod
+    def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        exec_results = wtq_execution_sql(program, example)
+
+        if exec_results is None:
+            return -1, "ERROR"
+        else:
+            exec_match_result = int(wtq_answer_eq(exec_results, example["original_answer"]))
+            return exec_match_result, exec_results
+
+class SpiderPythonExecutor(SpiderExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @staticmethod
+    def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        # get the table name -> dataframe dict
+        conn = sqlite3.connect(example["db_path"])
+        df_dicts = db_to_df_dict(conn)
+
+        # execute the program
+        exec_result = spider_execution_py(program, df_dicts)
+        if "order" in example["query_toks_no_value"]:
+            exec_match_result = int(spider_answer_eq(exec_result, example["answer"]))
+        else:
+            exec_match_result = int(spider_answer_eq(exec_result, example["answer"], True))
+
+        return exec_match_result, exec_result
+
+class WTQPythonExecutor(SpiderExecutor):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @staticmethod
+    def real_exec_program(program: str, example: Dict[str, Any]) -> Tuple[int, Union[str, List, Dict]]:
+        # get the table name -> dataframe dict
+        conn = sqlite3.connect(example["db_path"])
+        df_dicts = db_to_df_dict(conn)
+
+        # execute the program
+        exec_result = spider_execution_py(program, df_dicts)
+        exec_match_result = int(wtq_answer_eq([[exec_result]], example["original_answer"]))
+
+        return exec_match_result, exec_result
+
 class MBPPExecutor(BaseExecutor):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
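For orientation, here is a minimal sketch of how the annotation task above drives one of the new Python executors. Everything in the example dict is hypothetical and only mirrors fields the new code reads (`db_path`, `query`, `query_toks_no_value`, `answer`); the assumption that the pandas program leaves its result in a variable named `answer` comes from the error strings in eval_report.py, not from this diff.

```python
from execution.executors import SpiderPythonExecutor

executor = SpiderPythonExecutor()
example = {
    "db_path": "data/spider/database/concert_singer/concert_singer.sqlite",  # assumed path layout
    "query": "SELECT count(*) FROM singer",
    "query_toks_no_value": ["select", "count", "(", "*", ")", "from", "singer"],
    "answer": [[6]],  # assumed gold-answer format
}
exec_match, exec_result = executor.exec_program("answer = len(singer)", example)
print(exec_match, exec_result)  # 1 when the pandas program's result matches the gold answer
```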