hotel-sentiment-analysis/SentimentAnalysis.py at master · param17/hotel-sentiment-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#CSCI 5832 Natural Language Processing
#@author Paramjot Singh

#Hotel review sentiment analysis using Naive Bayes with add-one (Laplace) smoothing

import math
import re

def count(train):
    """Count word occurrences in a training file.

    Each line is expected to start with a review ID followed by the review
    text. Punctuation (, . : ; ? !) is stripped and the text is lower-cased
    before splitting on whitespace; the first token (the ID) is discarded.

    Args:
        train: Path of the training file to read.

    Returns:
        dict mapping each word to the number of times it occurs in the file.
    """
    word_dict = {}

    # The context manager guarantees the file is closed even if reading fails.
    with open(train, 'r') as train_file:
        for line in train_file:
            # Strip punctuation so that "wife" and "wife," count as the same
            # word; this does not help sentiment detection by itself, but it
            # keeps the vocabulary smaller.
            line = re.sub('[,.:;?!]', '', line)
            # Lower-case, split on whitespace and drop the leading review ID.
            words = line.lower().strip().split()[1:]

            for word in words:
                word_dict[word] = word_dict.get(word, 0) + 1

    return word_dict


def prob_calc(train, test_data, train_vocab):
    """Return add-one smoothed word probabilities for one class.

    Args:
        train: dict mapping word -> count for the class being scored.
        test_data: iterable of words from the review to score.
        train_vocab: collection whose length is used as the vocabulary size
            in the smoothing denominator.

    Returns:
        list with one probability per word of test_data, in the same order:
        (count of word in train + 1) / (total count in train + len(train_vocab)).
    """
    # Total number of word occurrences recorded for this class.
    total_tokens = 0
    for freq in train.values():
        total_tokens += freq

    # Words never seen in this class fall back to a count of zero.
    return [
        (train.get(token, 0) + 1) / (total_tokens + len(train_vocab))
        for token in test_data
    ]

def print_result(result, out_path='singh-paramjot-assgn3-out.txt'):
    """Write the classification results to a tab-separated text file.

    Args:
        result: dict mapping review ID -> label; both are written as strings.
        out_path: Destination file. Defaults to the assignment's output file
            name, so existing callers are unaffected.

    One line "<id>\\t<label>" is written per entry, in the dict's iteration
    order. An existing file at out_path is overwritten.
    """
    # The with-block closes the file; no explicit close() is needed.
    with open(out_path, 'w') as out_file:
        for review_id, label in result.items():
            out_file.write(review_id + '\t' + label + '\n')


def sentiment_calc(test_data):
    """Label each review in test_data as 'POS' or 'NEG'.

    Relies on the module-level names pos_train, neg_train and
    train_vocabulary, which must be defined before this is called.

    Args:
        test_data: dict mapping review ID -> list of words in the review.

    Returns:
        dict mapping the upper-cased review ID -> 'POS' or 'NEG'. Every
        labelled review is also printed. A review whose two scores are
        exactly equal gets no entry and is not printed.
    """
    labels = {}
    for review_id, words in test_data.items():
        pos_probs = prob_calc(pos_train, words, train_vocabulary)
        neg_probs = prob_calc(neg_train, words, train_vocabulary)

        # Sum log-probabilities per class. The class prior is left out
        # because it is the same (log 0.5) for both classes.
        pos_score = 0
        for p in pos_probs:
            pos_score += math.log(p)

        neg_score = 0
        for p in neg_probs:
            neg_score += math.log(p)

        if pos_score > neg_score:
            key = review_id.upper()
            print(key + '\tPOS\n')
            labels[key] = 'POS'
        elif neg_score > pos_score:
            key = review_id.upper()
            print(key + '\tNEG\n')
            labels[key] = 'NEG'
        # Equal scores: the review is left unlabelled.

    return labels


if __name__ == "__main__":
    # Script entry point: train on the two labelled files, classify the test
    # set and write the labels to the output file.
    # pos_train, neg_train and train_vocabulary are module-level names that
    # sentiment_calc() reads, so they must keep these exact names.

    # Word counts of the positive and negative training reviews.
    pos_train = count('hotelPosT-train.txt')
    neg_train = count('hotelNegT-train.txt')

    # Disabled experiment: clipping very frequent (>= 100) and single
    # occurrence words from both dictionaries, e.g.
    #   pos_train = {k: v for k, v in pos_train.items() if 1 < v < 100}

    # Dump both dictionaries sorted by descending frequency (diagnostics).
    ps = [(k, pos_train[k]) for k in sorted(pos_train, key=pos_train.get, reverse=True)]
    print(ps)

    ns = [(k, neg_train[k]) for k in sorted(neg_train, key=neg_train.get, reverse=True)]
    print(ns)

    # Join both dictionaries to form the vocabulary. For words present in
    # both, update() keeps the negative count; prob_calc() only uses the
    # vocabulary's size.
    train_vocabulary = pos_train.copy()
    train_vocabulary.update(neg_train)

    s = [(k, train_vocabulary[k]) for k in sorted(train_vocabulary, key=train_vocabulary.get, reverse=True)]
    print(s)

    # Read the test data: review ID -> list of normalised words.
    test_data = {}
    with open('HW3-testset.txt', 'r') as test_file:
        for line in test_file:
            tokens = re.sub('[.,;:!?]', '', line).lower().strip().split()
            if not tokens:
                # Blank line: there is no ID token to index, so skip it.
                continue
            test_data[tokens[0]] = tokens[1:]

    result = sentiment_calc(test_data)

    print_result(result)