一、随机森林/RF(Random Forest)
随机森林是集成学习Bagging流派中的一个变体。RF在以决策树为基学习器构建Bagging集成的基础上,进一步在决策树的训练过程中引入了随机属性选择。传统的决策树在选择划分属性时,是在当前结点的属性集合中选择最优的一个;而在RF中,对基决策树的每个结点,先从该结点的属性集合中随机选择一个包含k个属性的子集,然后再从这个子集中选择一个最优属性用于划分。
二、梯度提升决策树/GBDT(Gradient boosting decision tree)
在连载(五)中,我们知道提升树在每一轮迭代中都是拟合上一轮的残差;GBDT则提出用损失函数的负梯度作为回归问题中残差的近似值。
为什么GBDT中可以用损失函数的负梯度来代替上一步的残差?
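设第 $t$ 轮的损失函数为 $L(y, f_t(x))$,其中 $f_t(x) = f_{t-1}(x) + h_t(x)$。将该损失函数在 $f_{t-1}(x)$ 处做一阶泰勒展开:
$L(y, f_t(x)) \approx L(y, f_{t-1}(x)) + \dfrac{\partial L(y, f_{t-1}(x))}{\partial f_{t-1}(x)}\, h_t(x)$,其中 $h_t(x)$ 为第 $t$ 轮拟合的学习器。
由上式可知,要使得 $L(y, f_t(x)) < L(y, f_{t-1}(x))$,可以令 $h_t(x) = -\dfrac{\partial L(y, f_{t-1}(x))}{\partial f_{t-1}(x)}$,此时一阶项为 $-\left(\dfrac{\partial L(y, f_{t-1}(x))}{\partial f_{t-1}(x)}\right)^2 \le 0$,损失不会增大,于是可得到GBDT拟合上一步损失函数的负梯度。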
算法步骤:
将连载(五)中的计算残差替换成计算上一步的损失函数的负梯度即可。
代码实现:
import numpy as np
import math
def calculate_entropy(y):
    """Return the Shannon entropy (in bits) of the labels in ``y``.

    Args:
        y: Array-like of labels; each distinct value is treated as one class.

    Returns:
        float: The sum of ``-p * log2(p)`` over the distinct labels, where
        ``p`` is the number of occurrences of a label divided by ``len(y)``.
        ``0.0`` when ``y`` is empty.
    """
    labels = np.asarray(y)
    total = len(labels)
    # Count every label in a single pass rather than re-scanning ``y`` once
    # per distinct label.
    _, counts = np.unique(labels, return_counts=True)
    # ``counts`` is empty for empty input, so ``total == 0`` never divides.
    return float(sum(-(count / total) * math.log2(count / total)
                     for count in counts))
class DecisionNode():
    """A single node of the decision tree.

    Args:
        feature_i: Index of the feature this node tests.
        threshold: Value the tested feature is compared against.
        value: Payload stored on the node. Presumably the prediction held by
            a leaf; the leaf-building code is not visible here (TODO confirm).
        true_branch: Child node for samples that satisfy the test.
        false_branch: Child node for samples that fail the test.
    """

    def __init__(self, feature_i=None, threshold=None,
                 value=None, true_branch=None, false_branch=None):
        self.feature_i = feature_i
        self.threshold = threshold
        self.value = value
        self.true_branch = true_branch
        self.false_branch = false_branch
def divide_on_feature(X, feature_i, threshold):
    """Split the rows of ``X`` in two by testing one feature column.

    A numeric ``threshold`` (Python or NumPy int/float) selects the rows whose
    feature is ``>= threshold``; any other ``threshold`` selects the rows whose
    feature ``== threshold``.

    Args:
        X: 2-D array-like of samples, one row per sample.
        feature_i: Column index of the feature to test.
        threshold: Value to compare the feature against.

    Returns:
        numpy.ndarray: 1-D object array of length 2 holding
        ``[rows_passing_test, rows_failing_test]``; it can be unpacked as
        ``passed, failed = divide_on_feature(...)``.
    """
    X = np.asarray(X)
    # The two halves usually differ in length, and NumPy >= 1.24 refuses to
    # build a ragged array implicitly, so fill an object array slot by slot.
    halves = np.empty(2, dtype=object)
    if X.size == 0:
        halves[0] = halves[1] = X
        return halves

    column = X[:, feature_i]
    # np.number covers NumPy scalars such as np.int64, which are NOT
    # instances of the builtin ``int`` and would otherwise be compared
    # with ``==`` instead of ``>=``.
    if isinstance(threshold, (int, float, np.number)):
        mask = column >= threshold
    else:
        mask = column == threshold

    halves[0] = X[mask]
    halves[1] = X[~mask]
    return halves
# 超类
class DecisionTree(object):
def __init__(self, min_samples_split=2, min_impurity=1e-7,
max_depth=float("inf"), loss=None):
self.root = None #根节点
self.min_samples_split = min_samples_split
self.min_impurity = min_impurity
self.max_depth = max_depth
# 计算值 如果是分类问题就是信息增益,回归问题就基尼指数
self._impurity_calculation = None
self._leaf_value_calculation = None #计算叶子
self.one_dim = None
self.loss = loss
def fit(self, X, y, loss=None):
    """Build the tree from the training data and keep its root.

    Args:
        X: Training samples, passed straight to ``_build_tree``.
        y: Training targets, passed straight to ``_build_tree``.
        loss: Accepted for call compatibility; this method never reads it.
    """
    # Remember whether the targets arrived as a flat vector.
    self.one_dim = len(np.shape(y)) == 1
    self.root = self._build_tree(X, y)
    # NOTE(review): this clears the loss given to __init__, and the ``loss``
    # argument above is ignored. Confirm the reset is intentional.
    self.loss = None
def _build_tree(self, X, y, current_depth=0):
"""
递归求解树
"""
largest_impurity = 0
best_criteria = None
best_sets = None
if len(np.shape(y)) == 1:
y = np.expand_dims(y, axis=1)
Xy = np.concatenate((X, y), axis=1)
n_samples, n_features = np.shape(X)
if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
# 计算每一个特征的增益值
for feature_i in range(n_features):
feature_values = np.expand_dims(X[:, feature_i], axis=1)
unique_values = np.unique(feature_values)
for threshold in unique_values:
Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)
if len(Xy1) > 0 and len(Xy2) > 0:
y1 = Xy1[:, n_features:]
y2 = Xy2[:, n_features:]
# 计算增益值
impurity = self._impurity_calculation(y, y1, y2)
if impurity > largest_impurity:
largest_impurity = impurity
best_criteria = {"feature_i": feature_i, "threshold": threshold}
best_sets = {
"leftX": Xy1[:, :n_features],
"lefty": Xy1[:, n_features:],
"rightX": Xy2[:, :n_features],
"righty": Xy2[:, n_features:]
}
if largest_impurity > self.min_impurity:
true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1)
false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1)
return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
"threshold"], true_branch=true_branch, false_branch=fals