-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassify.py
176 lines (147 loc) · 6.33 KB
/
classify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from featureGen import FeatureGen
from featureSelect import *
from math import ceil
from matplotlib import pyplot
import numpy as np
from numpy.linalg import inv
import random
import sklearn
from sklearn.preprocessing import label_binarize
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
# Analyze the per-feature means of each class (per the original note, the
# features are presumably per-sensor forces -- confirm against FeatureGen).
def analyzeFeatureMeans(X, Y):
    """Print the column-wise mean of X separately for each class.

    Rows whose label equals 1 are reported on the 'PD means' line and rows
    whose label equals 0 on the 'Non-PD means' line; rows carrying any other
    label are ignored.

    X -- sequence of equal-length feature rows
    Y -- sequence of labels aligned with X
    """
    pdRows, nonPdRows = [], []
    for row, label in zip(X, Y):
        if label == 1:
            pdRows.append(row)
        if label == 0:
            nonPdRows.append(row)
    # axis=0 averages down the rows, giving one mean per feature column.
    pdMeans = np.mean(pdRows, axis=0)
    print('PD means: {}'.format(pdMeans))
    nonPdMeans = np.mean(nonPdRows, axis=0)
    print('Non-PD means: {}'.format(nonPdMeans))
# Analyze the mean and variance of one feature for non-PD and PD subjects
def analyzeGlobalMeans(X, Y):
    """Print mean and variance of the feature at index 1, per class.

    Two lines are printed: the first summarises rows labelled 0 (non-PD),
    the second rows labelled 1 (PD). Only element [1] of each row is used.

    X -- sequence of feature rows, each with at least two elements
    Y -- sequence of labels aligned with X
    """
    pairs = list(zip(X, Y))
    groups = []
    for wanted in (0, 1):
        groups.append([row[1] for row, label in pairs if label == wanted])
    means = [np.mean(values) for values in groups]
    variances = [np.var(values) for values in groups]
    for groupMean, groupVar in zip(means, variances):
        print("Mean: {}, variance: {}".format(groupMean, groupVar))
# Fit a classifier on growing prefixes of the data and plot train/test error
def plotTrainTest(clf, X, Y, trainingSizes=None):
    """Plot training and test error of clf against training-set size.

    For every size n, clf is refit on the first n samples; the training
    error is measured on that prefix and the test error on the remaining
    samples. The data is split in the order given (no shuffling is done
    here).

    clf           -- classifier exposing fit(X, Y) and predict(X)
    X, Y          -- samples and labels, sliceable and aligned
    trainingSizes -- optional iterable of integer prefix lengths; defaults
                     to range(100, 250, 25)

    Sizes that would leave either the training prefix or the held-out
    remainder empty are skipped, so small datasets no longer fail with a
    division by zero part-way through the sweep.

    Raises ValueError if no requested size fits the data.
    """
    if trainingSizes is None:
        trainingSizes = range(100, 250, 25)
    numSamples = len(Y)
    # A usable size needs at least one training and one test sample.
    usableSizes = [n for n in trainingSizes if 0 < n < numSamples]
    if not usableSizes:
        raise ValueError(
            'no training size leaves both training and test samples '
            '({} samples available)'.format(numSamples))

    def errorRate(samples, labels):
        # Fraction of samples whose predicted label differs from the truth.
        predictions = clf.predict(samples)
        numWrong = sum(int(predicted != actual)
                       for predicted, actual in zip(predictions, labels))
        return numWrong / float(len(labels))

    trainErrors, testErrors = [], []
    for n in usableSizes:
        trainX, testX = X[:n], X[n:]
        trainY, testY = Y[:n], Y[n:]
        clf.fit(trainX, trainY)
        trainErrors.append(errorRate(trainX, trainY))
        testErrors.append(errorRate(testX, testY))

    pyplot.plot(usableSizes, trainErrors, label="Training error")
    pyplot.plot(usableSizes, testErrors, label="Test error")
    pyplot.legend()
    pyplot.xlabel('Training set size')
    pyplot.ylabel('Error')
    pyplot.title('{} train and test error vs. training set size'.format(clf.__class__.__name__))
    pyplot.show()
def getTrainTestError(clf, X, Y):
    """Fit clf on the first 90% of the data and report both error rates.

    The split is positional (no shuffling): the first int(len(X) * 0.9)
    samples are used for training and the rest for testing. The error is
    the fraction of samples whose predicted label differs from the truth.

    clf  -- classifier exposing fit(X, Y) and predict(X)
    X, Y -- samples and labels, sliceable and aligned

    Prints both errors and returns them as (trainError, testError).
    Raises ValueError if the data is too small to leave a training sample.
    """
    # The split index is 90% of the *length*; scaling X itself (as the
    # earlier code did) either fails for lists or leaves no test data.
    splitAt = int(len(X) * 0.9)
    if splitAt == 0:
        raise ValueError('need at least two samples to split into train and test')
    trainX, testX = X[:splitAt], X[splitAt:]
    trainY, testY = Y[:splitAt], Y[splitAt:]
    clf.fit(trainX, trainY)

    def errorRate(samples, labels):
        # Fraction of samples whose predicted label differs from the truth.
        predictions = clf.predict(samples)
        numWrong = sum(int(predicted != actual)
                       for predicted, actual in zip(predictions, labels))
        return numWrong / float(len(labels))

    trainError = errorRate(trainX, trainY)
    testError = errorRate(testX, testY)
    print("Training error: {}, Test error: {}".format(trainError, testError))
    return trainError, testError
def getTrainTestAUC(clf, X, Y):
    """Fit clf on an 80/20 split and report ROC AUC on both parts.

    The split comes from train_test_split with test_size=0.2 and
    random_state=0, so it is the same on every call. Labels are expected to
    be binary.

    The AUC is computed from continuous scores (decision_function, else the
    positive-class column of predict_proba) so that it reflects how well
    the classifier ranks samples, matching what scoring='roc_auc' does in
    cross_validate_AUC. Hard predict() labels are used only as a last
    resort for classifiers that expose neither method.

    clf  -- classifier exposing fit(X, Y) and predict(X)
    X, Y -- samples and binary labels

    Prints both values and returns them as (trainingAUC, testAUC).
    """
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=0)
    clf.fit(trainX, trainY)

    def positiveScores(samples):
        # Prefer un-thresholded scores; thresholded labels collapse the ROC
        # curve to a single operating point.
        if hasattr(clf, 'decision_function'):
            return clf.decision_function(samples)
        if hasattr(clf, 'predict_proba'):
            # Column 1 holds the probability of the second (positive) class.
            return np.asarray(clf.predict_proba(samples))[:, 1]
        return clf.predict(samples)

    trainingAUC = roc_auc_score(trainY, positiveScores(trainX))
    testAUC = roc_auc_score(testY, positiveScores(testX))
    print("Training AUC: {}, Test AUC: {}".format(trainingAUC, testAUC))
    return trainingAUC, testAUC
def cross_validate_AUC(clf, X, Y):
    """Print the mean ROC AUC of clf over 10-fold cross-validation.

    clf  -- estimator accepted by cross_val_score
    X, Y -- samples and labels
    """
    foldScores = cross_val_score(clf, X, Y, scoring='roc_auc', cv=10)
    numFolds = float(len(foldScores))
    meanAuc = sum(foldScores) / numFolds
    clfName = clf.__class__.__name__
    print("Cross val AUC score for {}: {}".format(clfName, meanAuc))
def cross_validate_accuracy(clf, X, Y):
    """Print the mean 10-fold cross-validation score of clf.

    No scoring argument is passed, so cross_val_score falls back to the
    estimator's own default scorer.

    clf  -- estimator accepted by cross_val_score
    X, Y -- samples and labels
    """
    foldScores = cross_val_score(clf, X, Y, cv=10)
    numFolds = float(len(foldScores))
    meanAccuracy = sum(foldScores) / numFolds
    clfName = clf.__class__.__name__
    print("Cross val accuracy score for {}: {}".format(clfName, meanAccuracy))
def get_precision_recall(clf, X, Y):
    """Print precision and recall of clf averaged over 10 random splits.

    Each round holds out 10% of the data, refits clf on the remainder and
    scores its predictions on the held-out part.

    The split is seeded with the round number: a constant seed would
    produce the same partition ten times, making the average meaningless,
    while seeding per round keeps the result reproducible across runs.

    clf  -- classifier exposing fit(X, Y) and predict(X)
    X, Y -- samples and labels
    """
    precisions = []
    recalls = []
    for seed in range(10):
        trainX, testX, trainY, testY = train_test_split(
            X, Y, test_size=0.1, random_state=seed)
        clf.fit(trainX, trainY)
        # Precision and recall on the held-out samples only
        output = clf.predict(testX)
        precisions.append(precision_score(testY, output))
        recalls.append(recall_score(testY, output))
    avgPrecision = sum(precisions) / len(precisions)
    avgRecall = sum(recalls) / len(recalls)
    print("Precision for {}: {}".format(clf.__class__.__name__, avgPrecision))
    print("Recall for {}: {}".format(clf.__class__.__name__, avgRecall))
def multiclass_AUC(clf, X, Y):
    """Print and return the micro-averaged one-vs-rest ROC AUC of clf.

    Labels are binarized into one indicator column per class, the data is
    split 50/50 (random_state=0, so the split is the same on every call)
    and clf is wrapped in a OneVsRestClassifier. The micro average pools
    every (sample, class) decision into a single ROC curve.

    Continuous scores (decision_function, else predict_proba) are used for
    the curve so that the AUC reflects ranking quality; thresholded
    predict() output is only a fallback for classifiers exposing neither.

    clf  -- base classifier to wrap one-vs-rest
    X, Y -- samples and class labels; the indicator matrix is assumed to
            have one column per class (three or more classes -- TODO
            confirm behaviour for two)

    Returns the micro-averaged AUC (previously nothing was returned).
    """
    # Binarize the output; np.unique yields the classes in sorted order so
    # the column layout does not depend on set iteration order.
    X, Y = np.array(X), np.array(Y)
    Y = label_binarize(Y, classes=np.unique(Y))

    # shuffle and split training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5,
                                                        random_state=0)

    # Learn to predict each class against the other
    classifier = OneVsRestClassifier(clf).fit(X_train, Y_train)
    if hasattr(classifier, 'decision_function'):
        Y_score = classifier.decision_function(X_test)
    elif hasattr(classifier, 'predict_proba'):
        Y_score = classifier.predict_proba(X_test)
    else:
        Y_score = classifier.predict(X_test)
    Y_score = np.asarray(Y_score)

    # Compute micro-average ROC curve and ROC area. The per-class curves
    # were never reported, so they are no longer computed.
    fprMicro, tprMicro, _ = roc_curve(Y_test.ravel(), Y_score.ravel())
    microAuc = auc(fprMicro, tprMicro)
    print("AUC for multiclass {}: {}".format(clf.__class__.__name__, microAuc))
    return microAuc
def main():
    """Select features for the 'severity' labels and evaluate a decision tree.

    Steps: load X/Y from FeatureGen, run forward_search to pick a feature
    subset, restrict X to those columns, then report multi-class AUC,
    cross-validated accuracy and precision/recall for a fresh
    DecisionTreeClassifier.
    """
    f = FeatureGen()

    # Labels are requested with classifier='severity' and evaluated with
    # multiclass_AUC below, so this is treated as a multi-class problem.
    X, Y = f.getXY(classifier='severity')
    Y = np.asarray(Y)
    nfeatures = len(X[0])

    # forward_search and tst_multiclass_AUC are not defined in this file;
    # presumably they come from the featureSelect star-import. The result
    # is used below as a column index into X.
    best_sub = forward_search(nfeatures, tst_multiclass_AUC, [DecisionTreeClassifier(), X, Y])
    print(best_sub)

    # Keep only the selected feature columns.
    X = np.array(X)
    X = X[:, best_sub]

    multiclass_AUC(DecisionTreeClassifier(), X, Y)
    cross_validate_accuracy(DecisionTreeClassifier(), X, Y)
    # NOTE(review): precision_score/recall_score are called with their
    # default averaging; confirm that is valid for multi-class severity
    # labels on the installed scikit-learn version.
    get_precision_recall(DecisionTreeClassifier(), X, Y)


if __name__ == "__main__":
    main()