-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
162 lines (138 loc) · 5.45 KB
/
main.py
File metadata and controls
162 lines (138 loc) · 5.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
File for running our algorithms on the dataset
Authors: Kenny, Raymond, Real Rick
Date: 4/25/2019
"""
from preprocess import *
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score
from SVM import *
from naiveBayes import *
from decisionTrees import *
import MFC as MFC
import optparse, sys
import matplotlib.pyplot as plt
def parse_opts(opts, parser):
    """
    Purpose - Parses command line arguments to decide whether or not to
              perform binary or multiclass classification
    Params  - opts: command line arguments (optparse Values object)
              parser: parser object, used to print usage on error
    Returns - boolean, True if user wants a binary classification task, False
              otherwise. Exits the program (status 1) if --task is missing or
              unrecognized.
    """
    # --task is mandatory, but optparse cannot enforce that itself, so we
    # check here. getattr with a None default mirrors optparse's behavior of
    # defaulting unsupplied options to None.
    task = getattr(opts, 'task', None)
    if not task:
        print('mandatory option task is missing!\n')
        parser.print_help()
        sys.exit(1)
    # now figure out which task to do; map recognized names onto the boolean
    # flag the caller expects
    task_flags = {'binary': True, 'multiclass': False}
    if task in task_flags:
        return task_flags[task]
    print("ERROR: Unrecognized task! Use options 'binary' or 'multiclass'\n")
    parser.print_help()
    sys.exit(1)
def main():
    """
    Driver for the classifier comparison experiment.

    Parses the command line, loads the adult-income dataset, then trains and
    evaluates four models — SVM, Naive Bayes, a decision tree, and a
    most-frequent-class (MFC) baseline — and plots a bar chart comparing
    their accuracy scores. Exits via parse_opts() if --task is missing or
    unrecognized.
    """
    parser = optparse.OptionParser(description='main.py')
    parser.add_option('-t', '--task',
                      help="use 'binary' for binary classification, 'multiclass' for multiclass")
    opts = parser.parse_args()[0]
    binary = parse_opts(opts, parser)

    # load the dataset; binary mode collapses labels to a two-class task
    data = Data("data/adult.data")
    if binary:
        data.readData(binary=True)
    else:
        data.readData()
    data.createSVMDataset()

    print("\n====================================================")
    print("SVM Revving Up...")
    print("====================================================")
    # creates multi-class SVM model
    svmClassifier = SVM()
    # train SVM model
    svmClassifier.trainSVM(data.SVMTrain, data.yTrain)
    # evaluate test data
    svmPredictions = svmClassifier.testSVM(data.SVMTest)
    # evaluate accuracy and print confusion matrix
    svm_score = svmClassifier.evaluate(data.SVMTest, data.yTest, svmPredictions)
    # feature analysis for binary classification task
    if binary:
        print("\nSee feature importance graph...")
        svmClassifier.visualizeWeights(data.SVMFeatures, data.SVMFeatureMeans)
    # uncomment this code to perform hyperparameter tuning for the SVC
    # classifier
    """
    print("\n====================================================")
    print("\t Starting hyper-parameter tuning...")
    print("====================================================")
    svc_params = {"C": np.logspace(-10, 10, 21)}
    svmClassifier.runTuneTest(svc_params, data.SVMTrain, data.yTrain)
    svmClassifier.printTestScores()
    print("SVM Training Complete!")
    print("====================================================")
    """

    # dTreeFeats is only consumed by the commented-out visualize() call below
    dTreeFeats = data.createNBDataset()
    # Naive Bayes
    print("\n====================================================")
    print("Starting Naive Bayes...")
    print("====================================================")
    print("Making naive assumptions...")
    print("----------------------------------------------------")
    # splits the train and test data into examples and labels
    nbTrainX = data.NBdataTrain
    nbTrainY = data.yTrain
    nbTestX = data.NBdataTest
    nbTestY = data.yTest
    # trains the naive bayes classifier
    naiveBayesClassifier = NaiveBayes()
    naiveBayesClassifier.trainNB(nbTrainX, nbTrainY)
    # test our model on the test data and get predictions
    nbPredictions = naiveBayesClassifier.testNB(nbTestX)
    # evaluate the accuracy of nb model
    nb_score = naiveBayesClassifier.evaluate(nbTestY, nbPredictions)

    # DecisionTree
    print("\n====================================================")
    print("Planting Decision Tree Seeds...")
    print("====================================================")
    print("Watering soil...")
    print("----------------------------------------------------")
    # creates decision tree
    decisionTreeClassifier = DecisionTree()
    # split the train and test data into examples and labels
    treeTrainX = data.DTreeDataTrain
    treeTrainY = data.yTrain
    treeTestX = data.DTreeDataTest
    treeTestY = data.yTest
    # train the decision tree model
    decisionTreeClassifier.trainTree(treeTrainX, treeTrainY)
    print("Tree is fully Grown!\n")
    # test our model on the test data and get predictions
    treePredictions = decisionTreeClassifier.testTree(treeTestX)
    # evaluate the accuracy
    dtree_score = decisionTreeClassifier.evaluate(treeTestX, treeTestY, treePredictions)
    # decisionTreeClassifier.visualize(dTreeFeats)

    # MFC: predict-the-most-frequent-class baseline for comparison
    print("\n====================================================")
    print("Most Frequent Class Baseline")
    print("====================================================")
    mfc_score = MFC.evaluate(nbTrainY, nbTestY)

    print("\nSee overall score graph...\n")
    # plot comparative bar graph of all four accuracy scores
    scores = [svm_score, nb_score, dtree_score, mfc_score]
    x_labels = ['SVM', 'Naive Bayes', 'DTree', 'MFC']
    y_pos = np.arange(4)
    plt.bar(y_pos, scores, align='center', width=0.5)
    plt.xticks(y_pos, x_labels)
    plt.ylabel("Accuracy")
    plt.title("Accuracy Scores for Different Classifiers")
    plt.show()
    print("====================================================")
    print("PROGRAM COMPLETE!")
    print("====================================================")
if __name__ == "__main__":
main()