from __future__ import division, print_function
import numpy as np
from sklearn import datasets
import sys
import os
import math
import progressbar

# Import helper functions
from mlfromscratch.utils.data_manipulation import divide_on_feature, train_test_split, get_random_subsets, normalize
from mlfromscratch.utils.data_operation import accuracy_score, calculate_entropy
from mlfromscratch.unsupervised_learning import PCA
from mlfromscratch.supervised_learning import ClassificationTree

bar_widgets = [
    'Training: ', progressbar.Percentage(), ' ', progressbar.Bar(marker="-", left="[", right="]"),
    ' ', progressbar.ETA()
]


class RandomForest():
    """Random Forest classifier. Uses a collection of classification trees that
    train on random subsets of the data using random subsets of the features.

    Parameters:
    -----------
    n_estimators: int
        The number of classification trees that are used.
    max_features: int
        The maximum number of features that the classification trees are allowed to
        use.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_gain: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    debug: boolean
        True or false depending on if we wish to display the training progress.
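
    Example:
    --------
    A minimal usage sketch (it mirrors main() below); X_train, y_train and
    X_test are assumed to be NumPy arrays, and the exact accuracy will vary
    with the data split and the random feature/sample bagging:

        clf = RandomForest(n_estimators=100)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)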
"""
def __init__(self, n_estimators=100, max_features=None, min_samples_split=2,
min_gain=1e-7, max_depth=float("inf"), debug=False):
self.n_estimators = n_estimators # Number of trees
self.max_features = max_features # Maxmimum number of features per tree
self.feature_indices = [] # The indices of the features used for each tree
self.min_samples_split = min_samples_split
self.min_gain = min_gain # Minimum information gain req. to continue
self.max_depth = max_depth # Maximum depth for tree
self.debug = debug
self.bar = progressbar.ProgressBar(widgets=bar_widgets)
# Initialize decision trees
self.trees = []
for _ in range(n_estimators):
self.trees.append(
ClassificationTree(
min_samples_split=self.min_samples_split,
min_impurity=min_gain,
max_depth=self.max_depth))
def fit(self, X, y):
n_features = np.shape(X)[1]
# If max_features have not been defined => select it as
# sqrt(n_features)
if not self.max_features:
self.max_features = int(math.sqrt(n_features))
if self.debug:
print ("Training (%s estimators):" % (self.n_estimators))
# Choose one random subset of the data for each tree
subsets = get_random_subsets(X, y, self.n_estimators)
for i in self.bar(range(self.n_estimators)):
X_subset, y_subset = subsets[i]
# Feature bagging (select random subsets of the features)
idx = np.random.choice(range(n_features), size=self.max_features, replace=True)
# Save the indices of the features for prediction
self.feature_indices.append(idx)
# Choose the features corresponding to the indices
X_subset = X_subset[:, idx]
# Fit the tree to the data
self.trees[i].fit(X_subset, y_subset)
if self.debug:
progress = 100 * (i / self.n_estimators)
print ("Progress: %.2f%%" % progress)
def predict(self, X):
y_preds = []
# Let each tree make a prediction on the data
for i, tree in enumerate(self.trees):
# Select the features that the tree has trained on
idx = self.feature_indices[i]
# Make a prediction based on those features
prediction = tree.predict(X[:, idx])
y_preds.append(prediction)
# Take the transpose of the matrix to transform it so
# that rows are samples and columns are predictions by the
# estimators
y_preds = np.array(y_preds).T
y_pred = []
# For each sample
for sample_predictions in y_preds:
# Do a majority vote over the predictions (columns)
max_count = 0
most_common = None
# For each unique predicted label -> count occurences
# and save the most predicted label
for label in np.unique(sample_predictions):
count = len(sample_predictions[sample_predictions == label])
if count > max_count:
max_count = count
most_common = label
# The most common prediction gets added as final prediction
# of the sample
y_pred.append(most_common)
return y_pred


def main():
    data = datasets.load_iris()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, seed=2)

    clf = RandomForest(debug=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    pca = PCA()
    pca.plot_in_2d(X_test, y_pred, title="Random Forest", accuracy=accuracy, legend_labels=data.target_names)


if __name__ == "__main__":
    main()