-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrandom_forests.py
157 lines (125 loc) · 6.04 KB
/
random_forests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import numpy as np
from itertools import product
from copy import deepcopy
from scipy.stats import mode
from decision_trees import ClassificationTree, RegressionTree
import matplotlib.pyplot as plt
class RandomForest:
    '''
    Random Forest Regressor/Classifier class

    Holds ``n_estimators`` independent copies of a decision tree
    (``RegressionTree`` for the 'mean'/'median' estimators,
    ``ClassificationTree`` for 'mode') and combines their predictions.

    :method fit: fit every tree of the forest
    :method predict: aggregate the per-tree predictions
    :method get_params: parameters reported by the underlying tree
    '''

    def __init__(self, n_estimators=10, criterion='gini', estimator='mode',
                 min_samples_split=5, min_samples_leaf=1, max_features='auto', max_depth=3):
        '''
        Initialize random forest class

        :param n_estimators: number of estimators (i.e. number of decision trees)
        :param criterion: split criterion forwarded to each tree; must be
            'mse' or 'mae' for the 'mean'/'median' estimators and
            'gini' or 'entropy' for the 'mode' estimator
        :param estimator: how predictions are combined ('mean', 'median' or 'mode');
            also selects regression trees ('mean', 'median') or
            classification trees ('mode')
        :param min_samples_split: forwarded unchanged to each tree
        :param min_samples_leaf: forwarded unchanged to each tree
        :param max_features: maximum features to consider at each split;
            an integer or one of 'auto', 'sqrt', None (forwarded to each tree)
        :param max_depth: forwarded unchanged to each tree
        :raises AssertionError: if estimator, criterion or max_features is invalid
        '''
        estimator_criterion_dict = {
            'mean': ['mse', 'mae'],
            'median': ['mse', 'mae'],
            'mode': ['gini', 'entropy']
        }
        assert estimator in estimator_criterion_dict, \
            'Param "estimator" must be one of {}'.format(estimator_criterion_dict.keys())
        assert criterion in estimator_criterion_dict[estimator], \
            'Param "criterion" must be one of {} for leaf value estimator "{}"'\
            .format(estimator_criterion_dict[estimator], estimator)
        valid_max_features = ['auto', 'sqrt', None]
        assert isinstance(max_features, int) or max_features in valid_max_features, \
            'Param "max_features" must either be an integer or one of "{}"'.format(valid_max_features)

        self.n_estimators = n_estimators
        self.criterion = criterion
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.max_depth = max_depth

        # Regression estimators use regression trees, 'mode' uses classification trees.
        if estimator in ['mean', 'median']:
            base_estimator = RegressionTree(criterion=criterion, estimator=estimator,
                                            max_depth=max_depth,
                                            min_samples_split=min_samples_split,
                                            max_features=max_features,
                                            min_samples_leaf=min_samples_leaf)
        else:
            base_estimator = ClassificationTree(criterion=criterion, max_depth=max_depth,
                                                min_samples_split=min_samples_split,
                                                max_features=max_features,
                                                min_samples_leaf=min_samples_leaf)
        self.estimator = estimator
        # Each tree is an independent deep copy so fitting one never touches another.
        self.estimators = [deepcopy(base_estimator) for _ in range(n_estimators)]

    def get_params(self, deep=True):
        '''
        Return the parameters reported by the first tree of the forest

        :param deep: forwarded to the tree's own ``get_params``
        '''
        return self.estimators[0].get_params(deep)

    def fit(self, X, y):
        '''
        Fit every tree of the forest

        Each tree is fitted on the same, complete (X, y); no resampling is
        done here.
        NOTE(review): any diversity between trees therefore has to come from
        the tree implementation itself (presumably its ``max_features``
        handling) - confirm against decision_trees.

        :param X: training features, passed unchanged to each tree
        :param y: training targets, passed unchanged to each tree
        :return: self
        '''
        for tree in self.estimators:
            tree.fit(X, y)
        return self

    def predict(self, X):
        '''
        Predict value

        Collects the prediction of every tree and combines them along the
        tree axis with the mean, median or mode selected at construction.

        :param X: samples to predict, passed unchanged to each tree
        :return: aggregated predictions, one entry per sample
        '''
        # Aggregation function applied across trees (axis 0)
        leaf_value_estimator_dict = {
            'mean': np.mean,
            'median': np.median,
            'mode': mode,
        }
        estimator_fn = leaf_value_estimator_dict[self.estimator]

        stacked = np.array([tree.predict(X) for tree in self.estimators])
        combined = estimator_fn(stacked, axis=0)
        if self.estimator != 'mode':
            return combined

        # scipy.stats.mode returns a (mode, count) result. Older SciPy keeps
        # the reduced axis as a leading length-1 dimension while newer SciPy
        # drops it; blindly indexing [0][0] would then return only the first
        # sample's label. Strip the extra axis only when it is present.
        votes = np.asarray(combined.mode)
        if votes.ndim == stacked.ndim:
            votes = votes[0]
        return votes
def main():
    '''
    Demo: plot random forests of increasing size on two toy data sets

    Loads 'data/svm-train.txt' (first two columns are features, third column
    is the target) and draws the decision regions of a classifier forest,
    then loads 'data/krr-train.txt' (first column is the feature, second
    column is the target) and draws the fitted curve of a regressor forest.
    Each figure is a 2x3 grid with one panel per forest size.
    '''
    np.random.seed(0)

    # Forest sizes shown in the six panels of each figure (row-major order)
    forest_sizes = [1, 5, 10, 20, 50, 100]

    ############### Classifiers ###############
    data_train = np.loadtxt('data/svm-train.txt')
    x_train, y_train = data_train[:, 0:2], data_train[:, 2].reshape(-1, 1)
    # 0/1 labels are only used to colour the scatter points; the forest is
    # trained on the raw targets and its output is thresholded at 0 below.
    y_train_label = (y_train > 0).astype(int).reshape(-1, 1)
    point_colors = y_train_label.ravel()

    # Plotting decision regions
    x_min, x_max = x_train[:, 0].min() - 1, x_train[:, 0].max() + 1
    y_min, y_max = x_train[:, 1].min() - 1, x_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    f, axarr = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(10, 8))
    f.suptitle('Random Forest Classifier')
    # axarr.ravel() walks the panels row by row: (0,0), (0,1), (0,2), (1,0), ...
    for ax, n_est in zip(axarr.ravel(), forest_sizes):
        # Random Forest Classifier
        rf = RandomForest(n_estimators=n_est, max_features='auto',
                          criterion='entropy', estimator='mode', max_depth=5)
        rf.fit(x_train, y_train.ravel())

        Z = (rf.predict(grid_points) > 0).astype(int)
        Z = Z.reshape(xx.shape)

        ax.contourf(xx, yy, Z, alpha=0.4)
        ax.scatter(x_train[:, 0], x_train[:, 1], c=point_colors, alpha=0.8)
        ax.set_title('n_estimators = {}'.format(n_est))
    plt.show()
    #######################################################

    ############### Regressors ###############
    data_krr_train = np.loadtxt('data/krr-train.txt')
    x_krr_train = data_krr_train[:, 0].reshape(-1, 1)
    y_krr_train = data_krr_train[:, 1].reshape(-1, 1)

    # Evaluation points for the fitted curve: [0, 1) in steps of plot_size
    plot_size = 0.001
    x_range = np.arange(0., 1., plot_size).reshape(-1, 1)

    f, axarr = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(10, 8))
    f.suptitle('Random Forest Regressor')
    for ax, n_est in zip(axarr.ravel(), forest_sizes):
        # Random Forest Regressor
        rf = RandomForest(n_estimators=n_est, max_features='auto',
                          criterion='mse', estimator='mean', max_depth=2)
        rf.fit(x_krr_train, y_krr_train.ravel())
        y_predict = rf.predict(x_range)

        ax.plot(x_range, y_predict, color='r')
        ax.scatter(x_krr_train, y_krr_train.ravel(), alpha=0.8)
        ax.set_title('n_estimators = {}'.format(n_est))
        ax.set_xlim(0, 1)
    plt.show()
    #######################################################
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()