# qstn_classifier_trainer.py
import pickle
import nltk
# one-time setup: download the nltk qc corpus using nltk.download('qc');
# word_tokenize and pos_tag below also need the 'punkt' and
# 'averaged_perceptron_tagger' packages
def find_chunk(sent, chunk_rule=None):
    # Return the first chunk matching chunk_rule as a plain string, or
    # None when nothing matches. The default rule grabs the first noun
    # group that follows a wh-word.
    if not chunk_rule:
        chunk_rule = 'HCHUNK: <W.*><.*>*?{<N.*>+}'
    label = chunk_rule.split(':')[0].strip()
    cp = nltk.RegexpParser(chunk_rule)
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == label:
            return ' '.join(word for (word, tag) in subtree)
    return None
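# Illustrative sketch (the example sentence and tags are assumptions, not
# from the original script): the default HCHUNK rule picks out the first
# noun group after the wh-word, e.g.
#   find_chunk([('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
#               ('capital', 'NN'), ('of', 'IN'), ('France', 'NNP')])
#   -> 'capital'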
# POS tagging is resource intensive, so do it once and save the result
# as a pickle object for future use.
def pos_tag_questions(qstn_list):
    res = []
    count = 0
    for line in qstn_list:
        # each line looks like "COARSE:fine question text ..."
        coarse, rest = line.split(':', 1)
        rest = rest.split()
        fine = rest[0]
        sent = nltk.word_tokenize(' '.join(rest[1:]))
        res.append((coarse, fine, nltk.pos_tag(sent)))
        count += 1
        if count % 100 == 0:
            print("processed : " + str(count))
    return res

# This preprocessing is required only once.
from os.path import expanduser
qc_corpora = expanduser("~") + '/nltk_data/corpora/qc/'

# pre-process the train.txt dataset
with open(qc_corpora + 'train.txt', 'r') as f:
    train_qc = [line.strip() for line in f]
res = pos_tag_questions(train_qc)
with open("data/qc_train_data_pos_tag.pickle", "wb") as f:
    pickle.dump(res, f)

# pre-process the test.txt dataset
with open(qc_corpora + 'test.txt', 'r') as f:
    test_qc = [line.strip() for line in f]
res = pos_tag_questions(test_qc)
with open("data/qc_test_data_pos_tag.pickle", "wb") as f:
    pickle.dump(res, f)
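# Sketch of the expected record layout, assuming the qc corpus follows the
# standard TREC question-classification format
# (e.g. "DESC:manner How did serfdom develop ?"): each entry of the
# returned list is
#   ('DESC', 'manner', [('How', 'WRB'), ('did', 'VBD'), ('serfdom', 'NN'), ...])
# i.e. (coarse label, fine label, POS-tagged question).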
# Experiment with different features to get better accuracy.
# Don't forget to include the same feature extractor in process_grammar.py.
def qstn_feature_extractor(sent):
    features = {}
    for (w, t) in sent:
        if t.startswith('W'):  # or t == 'EX':
            features['qstn_word'] = w.lower()
    features['question_focus'] = find_chunk(sent)
    features['pos_tags'] = ' '.join(a[1] for a in sent)
    features['1st verb'] = find_chunk(sent, 'VCHUNK: <.*>*?{<V.*>+}<.*>*')
    return features
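# Illustrative sketch of the resulting feature dict for the (assumed) input
# "What is the capital of France ?"; exact values depend on the tagger:
#   {'qstn_word': 'what',
#    'question_focus': 'capital',
#    'pos_tags': 'WP VBZ DT NN IN NNP .',
#    '1st verb': 'is'}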
# Build a (features, fine-label) training pair; the coarse prediction from
# the globally trained coarse_classifier is added as an extra feature.
def fine_data_v2(c, f, q):
    features = qstn_feature_extractor(q)
    features['coarse'] = coarse_classifier.classify(features)
    return (features, f)
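# Design note: this stacks two Naive Bayes models, so the fine model can
# key on the predicted coarse class, e.g. features['coarse'] == 'LOC'
# pushing toward fine labels like 'city' (labels here are illustrative).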
qst = pickle.load(open("data/qc_train_data_pos_tag.pickle", 'rb'))
tst = pickle.load(open("data/qc_test_data_pos_tag.pickle", 'rb'))

# Train and evaluate the coarse classifier on (features, coarse-label) pairs.
qst_coarse = [(c, q) for (c, f, q) in qst]
qst_coarse_features = [(qstn_feature_extractor(q), c) for (c, q) in qst_coarse]
coarse_classifier = nltk.NaiveBayesClassifier.train(qst_coarse_features)
tst_coarse = [(c, q) for (c, f, q) in tst]
tst_coarse_features = [(qstn_feature_extractor(q), c) for (c, q) in tst_coarse]
print("coarse accuracy :", nltk.classify.accuracy(coarse_classifier, tst_coarse_features))
pickle.dump(coarse_classifier, open("data/q_coarse_classifier.pickle", "wb"))
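# Optional: peek at the features the coarse model relies on most;
# show_most_informative_features() is a standard method of NLTK's
# NaiveBayesClassifier.
# coarse_classifier.show_most_informative_features(10)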
# Train and evaluate the fine classifier on top of the coarse predictions.
fine_train_data = [fine_data_v2(c, f, q) for (c, f, q) in qst]
fine_classifier = nltk.NaiveBayesClassifier.train(fine_train_data)
del fine_train_data
fine_test_data = [fine_data_v2(c, f, q) for (c, f, q) in tst]
print("fine accuracy :", nltk.classify.accuracy(fine_classifier, fine_test_data))
pickle.dump(fine_classifier, open("data/q_fine_classifier.pickle", "wb"))
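# Illustrative usage sketch (an assumption, not part of the original
# pipeline): classify a new question with the two pickled models.
def classify_question(question, coarse_clf=None, fine_clf=None):
    coarse_clf = coarse_clf or pickle.load(open("data/q_coarse_classifier.pickle", 'rb'))
    fine_clf = fine_clf or pickle.load(open("data/q_fine_classifier.pickle", 'rb'))
    tagged = nltk.pos_tag(nltk.word_tokenize(question))
    features = qstn_feature_extractor(tagged)
    coarse = coarse_clf.classify(features)
    features['coarse'] = coarse
    return coarse, fine_clf.classify(features)

# e.g. classify_question("What is the capital of France ?")
# might return ('LOC', 'city'), depending on the trained models.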