# grd_train_58.py — train a GradientBoostingClassifier on 58 reduced features
# (105 lines / 92 loc; recovered from a web scrape — GitHub page chrome and the
# line-number gutter have been stripped).
### Read training data from file
import pickle
from tqdm import tqdm
import numpy as np
from datetime import datetime
from sklearn.preprocessing import normalize
## Load data
# Merge the five reduced-feature shards into one dict keyed by user pair.
# (The shards were previously five copy-pasted `with open` blocks differing
# only in the file index.)
train_data = {}
for shard in range(1, 6):
    shard_path = 'Reduced_features/Reduced_Features_train{}.pickle'.format(shard)
    with open(shard_path, 'rb') as f_in:
        train_data.update(pickle.load(f_in))
# Convert the two DateGap features (indices 9 and 10) from seconds to DAYS.
# 86400 is seconds-per-day; the original comment said "minute", which the
# divisor contradicts.
train_pair = sorted(train_data.keys())
for pair in tqdm(train_pair):
    train_data[pair][9] = float(train_data[pair][9]) / 86400
    train_data[pair][10] = float(train_data[pair][10]) / 86400
## Create training set
# Positive pair list: each line of train.csv is "u1,u2".
# Build the set directly instead of via a temporary list + set() + del.
positive_pair = set()
with open('data-train-dca/train.csv', 'r') as f_in:
    for line in f_in:
        # Strict 2-field unpack: a malformed line raises instead of being
        # silently accepted.
        user1, user2 = line.strip().split(',')
        positive_pair.add((user1, user2))
# Some positive pairs do not contain html information, but we don't need to
# consider that situation because our training pairs already eliminate them.
# Transfer data to numpy array/matrix: one row of 58 float features per pair,
# with a binary label (1 = pair appears in positive_pair).
train_set_x = np.zeros(shape=(len(train_pair), 58))
train_data_y = np.zeros(shape=(len(train_pair), 1))
print("Train Matrix x's Size = {}".format(train_set_x.shape))
print("Train Matrix y's Size = {}".format(train_data_y.shape))
for i, pair in enumerate(tqdm(train_pair)):
    # Vectorized row assignment replaces the original element-by-element
    # copy loop; dtype=float performs the per-element float() conversion.
    train_set_x[i, :] = np.asarray(train_data[pair][:58], dtype=float)
    train_data_y[i] = 1 if pair in positive_pair else 0
del train_data
# Zero out NaNs BEFORE normalization so they cannot poison the row norms.
train_set_x[np.isnan(train_set_x)] = 0
train_data_x = normalize(train_set_x)
del train_set_x
### Train Classifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.svm import SVC
from sklearn.preprocessing import normalize
# NOTE(review): loss='deviance' was renamed to 'log_loss' in scikit-learn 1.1
# and removed in 1.3 — this script targets an older sklearn; update the
# parameter name if the environment is upgraded.
grd = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
                                 n_estimators=10, verbose=1)
print(datetime.today().strftime("%m/%d %H:%M:%S grd start"))
grd.fit(train_data_x, train_data_y.ravel())
print(datetime.today().strftime("%m/%d %H:%M:%S grd end"))
# Save the classifier for later scoring runs.
with open('grd_classifier_58.pkl', 'wb') as fid:
    pickle.dump(grd, fid)
# Characteristic of classifier
print('Feature importance report from trained classifier :')
print(grd.feature_importances_)
# Evaluate on the training set. The original loop counted matches over BOTH
# classes, i.e. overall accuracy — the old message mislabeled it as the
# "true positive rate".
y_predict = grd.predict(train_data_x)
accuracy = float(np.mean(y_predict == train_data_y.ravel()))
print('Accuracy of trained classifier (based on train set) = {}'.format(accuracy * 100))
# svm = SVC(C = 1.0) # kernel = ‘rbf’/(‘linear’, ‘poly’, ‘sigmoid’),gamma =
# print(datetime.today().strftime("%m/%d %H:%M:%S SVM start"))
# svm.fit(train_data_x,train_data_y.ravel())
# print(datetime.today().strftime("%m/%d %H:%M:%S SVM end"))
# # save the classifier
# with open('svm_classifier_58.pkl', 'wb') as fid:
# pickle.dump(svm, fid, protocol = 2)
#
# y_predict = svm.predict(train_data_x)
# cont = 0
# for i in tqdm(range(len(y_predict))):
# if y_predict[i] == train_data_y.ravel()[i]:
# cont += 1
# print('True positive rate of trained classifier (based on train set) = {}'.format(cont/len(y_predict)*100))
# rf = RandomForestClassifier(max_depth=3, n_estimators=100, verbose = 1)