# 随机森林由多棵决策树构成: 对样本集合进行多次有放回的采样, 并在每次采样结果上构建一棵决策树; 对于预测样本, 遍历每一棵决策树, 记录相应结果, 再采取相应的汇总方法 (分类取多数投票, 回归取平均值), 汇总结果作为随机森林的输出
import numpy as np
from .DT import DecisionTree
def bootstrap_sample(X, Y):
    """Draw a bootstrap sample (sampling rows with replacement) from a dataset.

    The same randomly drawn row indices are applied to both ``X`` and ``Y`` so
    that every sampled feature row stays paired with its target.

    Args:
        X: Array-like of features whose first axis indexes samples.
        Y: Array-like of targets with the same length as ``X`` along axis 0.

    Returns:
        A tuple ``(X_sample, Y_sample)`` of NumPy arrays, each containing as
        many rows as the input.

    Raises:
        ValueError: If ``X`` and ``Y`` do not have the same number of samples.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Only the sample count matters here; the feature dimension is not needed,
    # so X is not required to be exactly two-dimensional.
    n_samples = X.shape[0]
    if Y.shape[0] != n_samples:
        raise ValueError(
            f"X and Y must have the same number of samples, "
            f"got {n_samples} and {Y.shape[0]}"
        )

    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], Y[idxs]
class RandomForest:
    """Ensemble of decision trees, each fitted on a bootstrap sample.

    Every tree is a ``DecisionTree`` built with the same hyper-parameters and
    trained on its own bootstrap sample of the training data. Predictions are
    combined per sample: by majority vote when ``classifier`` is true, and by
    arithmetic mean otherwise.

    Args:
        n_trees: Number of trees to build in ``fit``.
        max_depth: Forwarded to each ``DecisionTree`` as ``max_depth``.
        n_feats: Forwarded to each ``DecisionTree`` as ``n_feats``.
        classifier: If true, aggregate tree outputs by majority vote;
            otherwise average them.
        criterion: Forwarded to each ``DecisionTree`` as ``criterion``.
    """

    def __init__(self, n_trees, max_depth, n_feats, classifier=True, criterion="entropy"):
        self.trees = []
        self.n_trees = n_trees
        self.n_feats = n_feats
        self.max_depth = max_depth
        self.criterion = criterion
        self.classifier = classifier

    def fit(self, X, Y):
        """Fit ``n_trees`` decision trees, each on a fresh bootstrap sample.

        Any trees from a previous call are discarded.

        Args:
            X: Training features; the first axis indexes samples.
            Y: Training targets aligned with ``X``.
        """
        trees = []
        for _ in range(self.n_trees):
            X_samp, Y_samp = bootstrap_sample(X, Y)
            tree = DecisionTree(
                n_feats=self.n_feats,
                max_depth=self.max_depth,
                criterion=self.criterion,
                classifier=self.classifier,
            )
            tree.fit(X_samp, Y_samp)
            trees.append(tree)
        # Publish the new forest only once every tree has been fitted.
        self.trees = trees

    def predict(self, X):
        """Predict one output per sample in ``X`` by aggregating all trees.

        Args:
            X: Iterable of samples; each sample is passed to every tree.

        Returns:
            A NumPy array with one aggregated prediction per sample.

        Raises:
            RuntimeError: If the forest holds no trees (``fit`` not called).
        """
        if not self.trees:
            raise RuntimeError(
                "RandomForest has no fitted trees; call fit() before predict()"
            )
        # Relies on each tree exposing ``_traverse(sample, node)`` and ``root``.
        # The resulting array has one row per tree and one column per sample.
        tree_preds = np.array(
            [[t._traverse(x, t.root) for x in X] for t in self.trees]
        )
        return self._vote(tree_preds)

    def _vote(self, predictions):
        """Combine per-tree predictions into a single prediction per sample.

        Args:
            predictions: Array with one row per tree and one column per sample.

        Returns:
            A NumPy array with one entry per sample: the most frequent label
            (ties resolved in favour of the smallest label) when
            ``self.classifier`` is true, otherwise the mean of the tree
            outputs.
        """
        columns = np.asarray(predictions).T
        if self.classifier:
            out = []
            for votes in columns:
                # np.unique, unlike np.bincount, also handles float-typed,
                # negative, or non-numeric labels. Its sorted output makes
                # argmax pick the smallest label on ties, as bincount did.
                labels, counts = np.unique(votes, return_counts=True)
                out.append(labels[counts.argmax()])
        else:
            out = [np.mean(votes) for votes in columns]
        return np.array(out)