#!/usr/bin/env python
# coding: utf-8
##############################
# Importing Python libraries #
##############################
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
######################
# Importing datasets #
######################
data = pd.read_csv('data.csv') # dataset with objects (samples) and relative features (genes)
labels = pd.read_csv('labels.csv') # dataset with objects (samples) and relative labels (tumor class)
####################
# Data exploration #
####################
# display the first 5 rows of each dataset (wrapped in print so a plain script shows output)
print(data.head(5))
print(labels.head(5))
# display the total number of rows and columns in each dataset
print(data.shape)
print(labels.shape)
# statistical description of the dataset with objects and relative features
print(data.describe())
# understand further the variable to predict
print(labels['Class'].value_counts())
# Out of 801 samples, 300 are BRCA tumors, 146 are KIRC tumors, 141 are LUAD tumors, 136 are PRAD tumors and 78 are COAD tumors.
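# Optional sketch: the same class distribution can be visualized as a bar chart,
# using only the pandas and matplotlib imports already loaded above.
labels['Class'].value_counts().plot(kind='bar', title='Samples per tumor class')
plt.show()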
######################
# Data preprocessing #
######################
# store the feature set into the X variable and the corresponding labels into the Y variable
X = data.iloc[:, 1:] # skip the first column (the sample identifier)
Y = labels.iloc[:, 1]
### Missing value ratio
# compute the percentage of missing values in each column
missing_values = data.isnull().sum()/len(data)*100
# saving column names into a variable
variables = data.columns
variable = []
# keep only the columns whose missing-value ratio is at most the threshold
for i in range(len(variables)):
    if missing_values.iloc[i] <= 20: # setting the threshold as 20%
        variable.append(variables[i])
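# Optional sketch: materialize the filtered feature set under a new name so the original
# frame stays intact (data_filtered is a name introduced here for illustration; this
# RNA-Seq dataset typically has no missing values, so all columns are retained).
data_filtered = data[variable]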
### Converting categorical data
# Categorical data are variables that contain label values rather than numeric values; the number of possible values is usually limited to a fixed set. Here we use LabelEncoder, part of the scikit-learn library, to convert categorical (text) data into numbers that predictive models can work with.
# convert categorical data (BRCA, KIRC, LUAD, PRAD, COAD) into numbers
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
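# Optional sketch: inspect the label-to-integer mapping the encoder learned;
# classes_ lists the original class names in encoded order (0, 1, 2, ...).
print(dict(enumerate(labelencoder_Y.classes_)))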
### Splitting the datasets
# The data is split into a training set and a test set. The training set contains known outputs, and the model learns on it so that it can generalize to unseen data later on; the test set is then used to evaluate the model's predictions. We do this with scikit-learn's train_test_split function.
# split the dataset into 70% train and 30% test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=0)
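# Optional sanity check: confirm the 70/30 split sizes before moving on.
print(X_train.shape, X_test.shape)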
### Feature scaling
# PCA performs best on a standardized feature set, so we normalize the features with scikit-learn's StandardScaler (zero mean, unit variance per feature).
# fit the scaler on the training set only, then apply the same transform to the test set
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
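# Optional sanity check: after standardization each training feature should have mean ~0
# and standard deviation ~1 (X_train is a NumPy array after fit_transform).
print(X_train.mean(axis=0)[:5], X_train.std(axis=0)[:5])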
### Dimensionality reduction using PCA
# Principal component analysis (PCA) is a statistical technique that converts high-dimensional data to low-dimensional data by projecting it onto the principal components, the directions that capture the most variance in the dataset. PCA depends only on the feature set and not on the labels, so it can be considered an unsupervised machine learning technique. Performing PCA with scikit-learn is a three-step process.
# initialize the PCA class, keeping a single principal component
pca = PCA(n_components=1)
# fit PCA on the training features and project them onto the component
X_train = pca.fit_transform(X_train)
# project the test features using the component learned on the training set
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
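# Optional sketch: report how much of the total variance the single component retains.
# To keep enough components for, say, 95% of the variance instead, scikit-learn's PCA
# also accepts a fractional n_components, e.g. PCA(n_components=0.95) (not used here).
print(explained_variance)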
###################################
# Training and making predictions #
###################################
### Logistic regression
classifier_1 = LogisticRegression(C=1e70) # a very large C effectively disables regularization
# train the model
classifier_1.fit(X_train, Y_train)
# test the model using the test dataset
Y_pred1 = classifier_1.predict(X_test)
### Nearest neighbor
classifier_2 = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2) # Minkowski with p=2 is the Euclidean distance
classifier_2.fit(X_train, Y_train)
Y_pred2 = classifier_2.predict(X_test)
### SVM
classifier_3 = SVC(kernel = 'linear', random_state = 0)
classifier_3.fit(X_train, Y_train)
Y_pred3 = classifier_3.predict(X_test)
### Kernel SVM
classifier_4 = SVC(kernel = 'rbf', random_state = 0)
classifier_4.fit(X_train, Y_train)
Y_pred4 = classifier_4.predict(X_test)
### Naive Bayes
classifier_5 = GaussianNB()
classifier_5.fit(X_train, Y_train)
Y_pred5 = classifier_5.predict(X_test)
### Decision tree algorithm
classifier_6 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_6.fit(X_train, Y_train)
Y_pred6 = classifier_6.predict(X_test)
### Random forest classification
classifier_7 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier_7.fit(X_train, Y_train)
Y_pred7 = classifier_7.predict(X_test)
################################
# Model performance evaluation #
################################
### Logistic regression
print(accuracy_score(Y_test, Y_pred1))
confusion_matrix = pd.crosstab(Y_test, Y_pred1, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show() # render the heatmap; without this call a plain script never displays the plots
### Nearest neighbor
print(accuracy_score(Y_test, Y_pred2))
confusion_matrix = pd.crosstab(Y_test, Y_pred2, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
### SVM
print(accuracy_score(Y_test, Y_pred3))
confusion_matrix = pd.crosstab(Y_test, Y_pred3, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
### Kernel SVM
print(accuracy_score(Y_test, Y_pred4))
confusion_matrix = pd.crosstab(Y_test, Y_pred4, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
### Naive Bayes
print(accuracy_score(Y_test, Y_pred5))
confusion_matrix = pd.crosstab(Y_test, Y_pred5, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
### Decision tree algorithm
print(accuracy_score(Y_test, Y_pred6))
confusion_matrix = pd.crosstab(Y_test, Y_pred6, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
### Random forest classification
print(accuracy_score(Y_test, Y_pred7))
confusion_matrix = pd.crosstab(Y_test, Y_pred7, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
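# Optional sketch: compare all seven classifiers side by side; the predictions dict
# is introduced here for illustration, and the weighted F1 score uses the f1_score
# import from the top of the script.
predictions = {'Logistic regression': Y_pred1, 'Nearest neighbor': Y_pred2,
               'SVM': Y_pred3, 'Kernel SVM': Y_pred4, 'Naive Bayes': Y_pred5,
               'Decision tree': Y_pred6, 'Random forest': Y_pred7}
for name, pred in predictions.items():
    print(f"{name}: accuracy={accuracy_score(Y_test, pred):.4f}, "
          f"weighted F1={f1_score(Y_test, pred, average='weighted'):.4f}")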