-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSVM.py
More file actions
155 lines (135 loc) · 5.69 KB
/
SVM.py
File metadata and controls
155 lines (135 loc) · 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Uses one-vs-one SVM to train a model that estimates age
from
Authors: Kenny, Raymond, Rick
Date: 5/1/2019
"""
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import matplotlib.pyplot as plt
import numpy as np
class SVM:
    """
    Wraps a linear SVM classifier (sklearn LinearSVC) with helpers for
    hyper-parameter tuning, training, prediction, evaluation, and
    feature-weight visualization.
    """

    def __init__(self):
        """
        Initializes the SVM classifier.
        """
        # C=0.1 sets the regularization strength; a fixed random_state
        # makes runs reproducible.
        self.clf = svm.LinearSVC(C=0.1, random_state=42)
        # Per-fold held-out accuracies, populated by runTuneTest().
        self.svc_test_scores = None

    def runTuneTest(self, params, X, y):
        """
        Purpose - Tunes the SVC classifier's hyper-parameters.
                  Creates stratified train/test splits (5 folds), runs
                  GridSearchCV on each training split, prints the best
                  parameters and training score per fold, and records the
                  held-out accuracy for each fold.
        Params - params: a dictionary of hyperparameters for GridSearchCV
                 X: the training data (indexable by integer index arrays)
                 y: the training labels
        Returns - nothing; stores the per-fold test scores in
                  self.svc_test_scores
        """
        print("\nTuning SVC...\n")
        test_scores = []
        # Convert labels to an ndarray once, before the loop (the original
        # re-converted y on every fold iteration).
        y = np.array(y)
        stratifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        for fold_num, (train_index, test_index) in enumerate(stratifier.split(X, y), start=1):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf = GridSearchCV(self.clf, params)
            clf.fit(X_train, y_train)
            score = clf.score(X_train, y_train)
            print("Fold number: ", fold_num)
            print("Best Parameter: ", clf.best_params_)
            print("Training score: ", score, "\n")
            test_scores.append(clf.score(X_test, y_test))
        self.svc_test_scores = test_scores

    def printTestScores(self):
        """
        Purpose - prints out the scores from the tune test
        Params - none
        Returns - nothing, but has the side effect of printing the
                  tune test scores for the different folds
        """
        print("---------------------------------------------")
        print("Fold SVM Test Accuracy")
        # Iterate over however many fold scores actually exist (the
        # original hardcoded range(5), which raises IndexError if fewer
        # folds were run).
        for i, fold_score in enumerate(self.svc_test_scores):
            print("%4d %19.3f" % (i, fold_score))
        print("\n")

    def trainSVM(self, XTrain, yTrain):
        """
        Purpose - trains the SVM classifier
        Param - XTrain - training data
                yTrain - training labels
        Return - nothing; self.clf is fitted in place as a side effect
        """
        # Train the classifier on the training data
        print("Training...")
        self.clf.fit(XTrain, yTrain)
        print("Done training.\n")

    def testSVM(self, XTest):
        """
        Purpose - runs the trained SVM model on the test data
        Param - XTest - test data
        Return - yPred - predicted labels for XTest
        """
        # creates the array of predicted labels
        yPred = self.clf.predict(XTest)
        return yPred

    def evaluate(self, XTest, yTrue, yPred):
        """
        Purpose - gives us our test accuracy and confusion matrix
        Param - XTest - test data
                yTrue - list of actual test labels
                yPred - list of predicted test labels
        Returns - svmScore: score of the SVM classifier
        """
        # accuracy score on the test data
        svmScore = self.clf.score(XTest, yTrue)
        # "classifier" typo fixed in the printed message
        print("SVM classifier score was ", svmScore)
        print("\nConfusion Matrix")
        print(confusion_matrix(yTrue, yPred))
        return svmScore

    def visualizeWeights(self, features, feat_means):
        """
        Purpose - draws a bar graph that helps analyse feature importance
                  using the corresponding feature weights from our trained
                  classifier. Features are ranked by |w * x_mean| and the
                  top 5 are printed and plotted.
        Params - features: a list of feature names for our SVM classifier
                 feat_means: per-feature mean values, aligned with features
                 (assumes the flattened coef_ has one entry per feature —
                 TODO confirm for multiclass models, where coef_ is
                 (n_classes, n_features))
        Returns - Nothing, prints the top features and displays a
                  graphical representation
        """
        weights = self.clf.coef_
        # Element-wise (w * x_mean): relative impact of each weight at the
        # class means. Flatten generically instead of the original
        # hardcoded reshape to (87,), which broke for any other width.
        wT_x = np.multiply(weights, feat_means).reshape(-1)
        feat_impact_dict = {}
        original_vals = {}
        for name, impact in zip(features, wT_x):
            # sort by magnitude, but keep the signed value for display
            feat_impact_dict[name] = abs(impact)
            original_vals[name] = impact
        sorted_feat_impacts = sorted(feat_impact_dict.items(), key=lambda kv: kv[1], reverse=True)
        top_5_features = [pair[0] for pair in sorted_feat_impacts][:5]
        top_5_wTx = []
        print("\nHere are the top 5 most impactful features (SVM, bin. classification):\n")
        for feature in top_5_features:
            original_impact_val = original_vals[feature]
            top_5_wTx.append(original_impact_val)
            print("%s : %f" % (feature, original_impact_val))
        print("---------------------------------------------------\n")
        # graphically,
        y_pos = np.arange(len(top_5_features))
        plt.bar(y_pos, top_5_wTx, align='center', width=0.5)
        plt.xticks(y_pos, top_5_features)
        plt.ylabel("wT * x")
        title = "SVM Feature Analysis: Binary Classification"
        plt.title(title)
        plt.show()