Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions qanta/ingestion/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from qanta.ingestion.preprocess import (
format_qanta_json,
add_sentences_,
add_answer_prompts_,
questions_to_sqlite,
)
from qanta.ingestion.protobowl import compute_question_player_counts
Expand Down Expand Up @@ -160,6 +161,7 @@ def run(self):
with open(QANTA_UNMAPPED_DATASET_PATH) as f:
qanta_questions = json.load(f)["questions"]
add_sentences_(qanta_questions)
add_answer_prompts_(qanta_questions)
with open(QANTA_PREPROCESSED_DATASET_PATH, "w") as f:
json.dump(format_qanta_json(qanta_questions, DS_VERSION), f)

Expand Down
33 changes: 33 additions & 0 deletions qanta/ingestion/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import spacy
import unidecode
import ftfy
import re
from qanta import qlogging
from qanta.spark import create_spark_context

Expand Down Expand Up @@ -79,6 +80,38 @@ def add_sentences_(questions, parallel=True):
# Get the 0th sentence, end character tokenization (tuple position 1)
q["first_sentence"] = text[: tokenization[0][1]]

def extract_prompt(ans):
    """Return the bracketed answer-line guidance embedded in ``ans``.

    The answer string is searched for a parenthesised ``(...)`` or square
    bracketed ``[...]`` segment that holds alternate-answer instructions.
    Two tiers of trigger words are used:

    1. If the answer contains "accept", "prompt" or "pronounce" (matched
       case-insensitively, as substrings so that "acceptable" or
       "pronounced" also count), the bracketed segment containing one of
       those words is returned.
    2. Otherwise, if the answer contains the standalone word "or", the
       bracketed segment containing that word is returned.

    Parentheses are tried before square brackets. At least one character
    must precede the opening bracket.

    Args:
        ans: The raw answer string. ``None`` or an empty string is treated
            as having no prompt.

    Returns:
        The text between the brackets with surrounding whitespace removed,
        or ``""`` when no matching segment exists.
    """
    # Missing/empty answers simply have no prompt; avoid crashing on .lower().
    if not ans:
        return ""

    lowered = ans.lower()
    if any(word in lowered for word in ("accept", "prompt", "pronounce")):
        keyword = r"(?:accept|prompt|pronounce)"
    elif re.search(r"\bor\b", lowered):
        # Require "or" as a whole word: a bare substring test would fire on
        # words such as "author" or "Victor" and return unrelated text.
        keyword = r"\bor\b"
    else:
        return ""

    # Parenthesised segments take priority over square-bracketed ones.
    for opener, closer in ((r"\(", r"\)"), (r"\[", r"\]")):
        pattern = r".+" + opener + r"(.*" + keyword + r".*)" + closer
        m = re.match(pattern, ans, flags=re.IGNORECASE)
        if m is not None:
            return m.group(1).strip()

    return ""

def add_answer_prompts_(questions):
    """Annotate each question in place with an ``"answer_prompt"`` entry.

    For every mapping in ``questions`` the value under ``"answer"`` is passed
    to ``extract_prompt`` and the result is stored under ``"answer_prompt"``.
    The trailing underscore signals in-place mutation; nothing is returned.
    """
    for question in questions:
        raw_answer = question["answer"]
        question["answer_prompt"] = extract_prompt(raw_answer)

def questions_to_sqlite(qanta_questions, db_path):
conn = sqlite3.connect(db_path)
Expand Down