Part D: Gradient Boosting Trees — Datawhale
I. Exercises
1. Exercise 01
Solution:
(1)
(2) The regularization term on w is introduced because the model wants the values fitted in each round to stay small; the resulting objective is quadratic in w, and since w is a small quantity, expanding beyond second order is meaningless.
(3)
Therefore
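The displayed formulas in this solution did not survive extraction; for reference, here is the standard second-order expansion of the XGBoost objective that the solution relies on (notation $g_i$, $h_i$, $G_j$, $H_j$ as in the XGBoost paper):

$$
\mathcal{L}^{(t)} \approx \sum_{i=1}^{n}\left[g_i f_t(x_i) + \frac{1}{2} h_i f_t(x_i)^2\right] + \gamma T + \frac{1}{2}\lambda \sum_{j=1}^{T} w_j^2
$$

Minimizing the quadratic in each leaf weight $w_j$ gives

$$
w_j^{*} = -\frac{G_j}{H_j + \lambda}, \qquad \mathcal{L}^{*} = -\frac{1}{2}\sum_{j=1}^{T}\frac{G_j^2}{H_j+\lambda} + \gamma T,
$$

where $G_j$ and $H_j$ sum $g_i$ and $h_i$ over the samples in leaf $j$. Because the objective is already an exact quadratic in $w_j$, third-order and higher terms contribute nothing, which is the point made in (2).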
2. Exercise 02
Solution:
(1) Root Absolute Error:
Its second derivative satisfies $q_i < 0$, so it cannot be used as a loss function.
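The dropped derivation is easy to reconstruct. Taking $L(y,\hat y) = \sqrt{|y - \hat y|}$ as the definition and, for concreteness, $y > \hat y$:

$$
\frac{\partial L}{\partial \hat y} = -\frac{1}{2}(y-\hat y)^{-1/2}, \qquad
q_i = \frac{\partial^2 L}{\partial \hat y^2} = -\frac{1}{4}(y-\hat y)^{-3/2} < 0,
$$

and the case $y < \hat y$ is symmetric, so the Hessian is negative wherever it is defined.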
(2) Squared Log Error:
Its second derivative is not always positive as the sample varies, so it cannot be used as a loss function.
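With $L(y,\hat y) = \frac{1}{2}\left[\log(1+y) - \log(1+\hat y)\right]^2$ and $u = \log\frac{1+\hat y}{1+y}$:

$$
\frac{\partial L}{\partial \hat y} = \frac{u}{1+\hat y}, \qquad
q_i = \frac{\partial^2 L}{\partial \hat y^2} = \frac{1-u}{(1+\hat y)^2},
$$

which is negative whenever $u > 1$, so the Hessian is not positive for every sample.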
(3) Pseudo Huber Error:
Its second derivative is positive everywhere, so it can be used as a loss function.
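Reconstructing the missing algebra, with $L_\delta(y,\hat y) = \delta^2\left(\sqrt{1 + \left((y-\hat y)/\delta\right)^2} - 1\right)$ and $r = y - \hat y$:

$$
\frac{\partial L}{\partial \hat y} = \frac{-r}{\sqrt{1+(r/\delta)^2}}, \qquad
q_i = \frac{\partial^2 L}{\partial \hat y^2} = \left(1+(r/\delta)^2\right)^{-3/2} > 0,
$$

so the Hessian is strictly positive for all samples.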
3. Exercise 03
(1) Solution:
In the best case 2 colors suffice; in the worst case d colors are needed.
(2) Solution:
The worst case is that every feature forms its own family of exclusive features, which requires all features to be pairwise mutually exclusive; see the sketch below.
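A minimal greedy-bundling sketch may make the two bounds concrete (the function name and the brute-force conflict check are this note's own illustration, not LightGBM's actual EFB code): features are vertices, a conflict is an edge, and each bundle corresponds to one color.

import numpy as np

def greedy_bundles(X):
    """Greedily bundle features that are never nonzero on the same sample.
    Equivalent to greedy graph coloring where features are vertices and
    an edge means 'the two features conflict on some sample'."""
    n_features = X.shape[1]
    nonzero = X != 0
    bundles = []  # each bundle is a list of mutually exclusive features
    for j in range(n_features):
        for bundle in bundles:
            # feature j can join a bundle only if it conflicts with no member
            if not any((nonzero[:, j] & nonzero[:, k]).any() for k in bundle):
                bundle.append(j)
                break
        else:
            bundles.append([j])  # no compatible bundle: open a new "color"
    return bundles

# Two exclusive one-hot-style columns collapse into one bundle, while a
# column that conflicts with everything needs its own bundle; with all
# features pairwise conflicting, d bundles ("colors") are required.
X = np.array([[1, 0, 3],
              [0, 2, 3],
              [1, 0, 3]])
print(greedy_bundles(X))  # [[0, 1], [2]]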
II. Code Implementations
1. XGBoost implementation (by instructor GYH)
# BasicTree.py
import numpy as np

def MSE(y):
    return ((y - y.mean()) ** 2).sum() / y.shape[0]

class Node:
    def __init__(self, depth, idx):
        self.depth = depth
        self.idx = idx          # boolean mask of the samples in this node
        self.left = None
        self.right = None
        self.feature = None
        self.pivot = None

class Tree:
    def __init__(self, max_depth, lamda, gamma):
        self.max_depth = max_depth
        self.lamda = lamda      # L2 regularization on leaf weights
        self.gamma = gamma      # complexity penalty per leaf
        self.X = None
        self.y = None
        self.feature_importances_ = None

    def _able_to_split(self, node):
        return (node.depth < self.max_depth) & (node.idx.sum() >= 2)

    def _get_inner_split_score(self, to_left, to_right):
        return self._get_score(to_left) + self._get_score(to_right)

    def _inner_split(self, col, idx):
        data = self.X[:, col]
        best_val = -np.inf
        best_to_left, best_to_right, best_pivot = None, None, None
        for pivot in data[:-1]:
            to_left = (idx == 1) & (data <= pivot)
            to_right = (idx == 1) & (~to_left)
            if to_left.sum() == 0 or to_left.sum() == idx.sum():
                continue
            Hyx = self._get_inner_split_score(to_left, to_right)
            if best_val < Hyx:
                best_val, best_pivot = Hyx, pivot
                best_to_left, best_to_right = to_left, to_right
        return best_val, best_to_left, best_to_right, best_pivot

    def _get_leaf_score(self, idx):
        best_val = -np.inf
        idx_left, idx_right, best_feature, best_pivot = None, None, None, None
        for col in range(self.X.shape[1]):
            Hyx, _idx_left, _idx_right, pivot = self._inner_split(col, idx)
            if best_val < Hyx:
                best_val, idx_left, idx_right = Hyx, _idx_left, _idx_right
                best_feature, best_pivot = col, pivot
        return best_val, idx_left, idx_right, best_feature, best_pivot

    def _get_score(self, idx):
        # XGBoost structure score G^2 / (H + lambda) for the samples in idx
        return self.p[idx].sum() ** 2 / (self.q[idx].sum() + self.lamda)

    def split(self, node):
        if not self._able_to_split(node):
            return None, None, None, None
        node_score = self._get_score(node.idx)
        (
            leaf_score,
            idx_left,
            idx_right,
            feature,
            pivot
        ) = self._get_leaf_score(node.idx)
        if feature is None:
            # no valid split exists on any feature: keep the node as a leaf
            return None, None, None, None
        gain = (leaf_score - node_score) / 2 - self.gamma
        # feature importance accumulates the gain weighted by sample share
        relative_gain = node.idx.sum() / self.X.shape[0] * gain
        self.feature_importances_[feature] += relative_gain
        node.left = Node(node.depth + 1, idx_left)
        node.right = Node(node.depth + 1, idx_right)
        self.depth = max(node.depth + 1, self.depth)
        return idx_left, idx_right, feature, pivot

    def build_prepare(self):
        self.depth = 0
        self.feature_importances_ = np.zeros(self.X.shape[1])
        self.root = Node(depth=0, idx=np.ones(self.X.shape[0]) == 1)

    def build_node(self, cur_node):
        if cur_node is None:
            return
        idx_left, idx_right, feature, pivot = self.split(cur_node)
        cur_node.feature, cur_node.pivot = feature, pivot
        self.build_node(cur_node.left)
        self.build_node(cur_node.right)

    def build(self):
        self.build_prepare()
        self.build_node(self.root)

    def _search_prediction(self, node, x):
        if node.left is None and node.right is None:
            # optimal leaf weight w* = -G / (H + lambda)
            return -self.p[node.idx].sum() / (self.q[node.idx].sum() + self.lamda)
        if x[node.feature] <= node.pivot:
            node = node.left
        else:
            node = node.right
        return self._search_prediction(node, x)

    def predict(self, x):
        return self._search_prediction(self.root, x)

class DecisionTreeRegressor:
    def __init__(self, max_depth, lamda, gamma):
        self.tree = Tree(max_depth, lamda, gamma)

    def fit(self, X, y, p, q):
        self.tree.X = X
        self.tree.y = y
        self.tree.p = p  # first-order gradients
        self.tree.q = q  # second-order gradients (Hessians)
        self.tree.build()
        self.feature_importances_ = (
            self.tree.feature_importances_ / self.tree.feature_importances_.sum()
        )
        return self

    def predict(self, X):
        return np.array([self.tree.predict(x) for x in X])
# training script (a separate file that imports BasicTree.py)
from BasicTree import DecisionTreeRegressor as DT
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

class Myboost:
    def __init__(self, max_depth=4, n_estimator=1000, lamda=1, gamma=0, lr=0.2):
        self.max_depth = max_depth
        self.n_estimator = n_estimator
        self.lamda = lamda
        self.gamma = gamma
        self.lr = lr
        self.booster = []
        self.feature_importances_ = 0
        self.best_round = None

    def record_score(self, y_train, y_val, train_predict, val_predict, i):
        mse_val = mean_squared_error(y_val, val_predict)
        mse_train = mean_squared_error(y_train, train_predict)
        print("Round %d\ttrain MSE: %.4f\t"
              "validation MSE: %.4f" % (i + 1, mse_train, mse_val))
        return mse_val

    def fit(self, X, y):
        # split off a validation set for early stopping
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.25, random_state=0)
        train_predict, val_predict = 0, 0
        # derivatives of the squared loss at the initial prediction 0:
        # p = -(y - y_hat) (gradient), q = 1 (Hessian)
        p = -(y_train - train_predict)
        q = np.ones(X_train.shape[0])
        last_val_score = np.inf
        for i in range(self.n_estimator):
            cur_booster = DT(self.max_depth, self.lamda, self.gamma)
            cur_booster.fit(X_train, y_train, p, q)
            self.feature_importances_ += cur_booster.feature_importances_
            train_predict += cur_booster.predict(X_train) * self.lr
            val_predict += cur_booster.predict(X_val) * self.lr
            p = -(y_train - train_predict)  # refresh gradients
            self.booster.append(cur_booster)
            cur_val_score = self.record_score(
                y_train, y_val, train_predict, val_predict, i)
            if cur_val_score > last_val_score:
                self.best_round = i
                print("\nTraining finished! Best number of rounds: %d"
                      % self.best_round)
                break
            last_val_score = cur_val_score
        else:
            # early stopping never triggered: use all rounds
            self.best_round = self.n_estimator

    def predict(self, X):
        cur_predict = 0
        # stop at the round with the best validation score to avoid overfitting
        for i in range(self.best_round):
            cur_predict += self.lr * self.booster[i].predict(X)
        return cur_predict

if __name__ == "__main__":
    X, y = make_regression(
        n_samples=400, n_features=8, n_informative=4, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0)
    model = Myboost()
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    mse = mean_squared_error(y_test, prediction)
    print("\nTest-set MSE: %.4f" % mse)
2. XGBoost implementation (from the Machine-Learning-From-Scratch repository)
https://github.com/RRdmlearning/Machine-Learning-From-Scratch/tree/master/xgboost
import numpy as np
import progressbar

# DecisionTree (the base CART class), LeastSquaresLoss and bar_widgets are
# defined in the linked repository's own modules; only the XGBoost-specific
# classes are reproduced here.

class XGBoostRegressionTree(DecisionTree):
    """
    Regression tree for XGBoost
    - Reference -
    http://xgboost.readthedocs.io/en/latest/model.html
    """

    def _split(self, y):
        """ y contains y_true in the left half of the middle column and
        y_pred in the right half. Split and return the two matrices. """
        col = int(np.shape(y)[1] / 2)
        y, y_pred = y[:, :col], y[:, col:]
        return y, y_pred

    def _gain(self, y, y_pred):
        # structure score G^2 / (2H); this simplified version omits the
        # lambda regularization term
        nominator = np.power((self.loss.gradient(y, y_pred)).sum(), 2)
        denominator = self.loss.hess(y, y_pred).sum()
        return 0.5 * (nominator / denominator)

    def _gain_by_taylor(self, y, y1, y2):
        # split each stacked matrix into (y_true, y_pred)
        y, y_pred = self._split(y)
        y1, y1_pred = self._split(y1)
        y2, y2_pred = self._split(y2)
        true_gain = self._gain(y1, y1_pred)
        false_gain = self._gain(y2, y2_pred)
        gain = self._gain(y, y_pred)
        return true_gain + false_gain - gain

    def _approximate_update(self, y):
        # y split into y, y_pred
        y, y_pred = self._split(y)
        # leaf value: aggregated gradient divided by aggregated Hessian
        gradient = np.sum(self.loss.gradient(y, y_pred), axis=0)
        hessian = np.sum(self.loss.hess(y, y_pred), axis=0)
        update_approximation = gradient / hessian
        return update_approximation

    def fit(self, X, y):
        self._impurity_calculation = self._gain_by_taylor
        self._leaf_value_calculation = self._approximate_update
        super(XGBoostRegressionTree, self).fit(X, y)

class XGBoost(object):
    """The XGBoost regressor.
    Reference: http://xgboost.readthedocs.io/en/latest/model.html

    n_estimators: int
        The number of trees that are used.
    learning_rate: float
        The step length taken when following the negative gradient
        during training.
    min_samples_split: int
        The minimum number of samples needed to make a split when
        building a tree.
    min_impurity: float
        The minimum impurity required to split the tree further.
    max_depth: int
        The maximum depth of a tree.
    """

    def __init__(self, n_estimators=200, learning_rate=0.01, min_samples_split=2,
                 min_impurity=1e-7, max_depth=2):
        self.n_estimators = n_estimators            # Number of trees
        self.learning_rate = learning_rate          # Step size for weight update
        self.min_samples_split = min_samples_split  # Minimum n of samples to justify a split
        self.min_impurity = min_impurity            # Minimum variance reduction to continue
        self.max_depth = max_depth                  # Maximum depth for tree
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)

        # Least-squares loss for regression
        self.loss = LeastSquaresLoss()

        # Initialize regression trees
        self.trees = []
        for _ in range(n_estimators):
            tree = XGBoostRegressionTree(
                min_samples_split=self.min_samples_split,
                min_impurity=min_impurity,
                max_depth=self.max_depth,
                loss=self.loss)
            self.trees.append(tree)

    def fit(self, X, y):
        # y = to_categorical(y)
        m = X.shape[0]
        y = np.reshape(y, (m, -1))
        y_pred = np.zeros(np.shape(y))
        for i in self.bar(range(self.n_estimators)):
            tree = self.trees[i]
            # stack y_true and the current prediction side by side so the
            # tree can compute gradients during its own fit
            y_and_pred = np.concatenate((y, y_pred), axis=1)
            tree.fit(X, y_and_pred)
            update_pred = tree.predict(X)
            update_pred = np.reshape(update_pred, (m, -1))
            y_pred += update_pred

    def predict(self, X):
        y_pred = None
        m = X.shape[0]
        # sum the contributions of all trees
        for tree in self.trees:
            update_pred = tree.predict(X)
            update_pred = np.reshape(update_pred, (m, -1))
            if y_pred is None:
                y_pred = np.zeros_like(update_pred)
            y_pred += update_pred
        return y_pred
3. LightGBM implementation
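The notes stop at this heading. As a placeholder, a minimal run with the lightgbm package on the same synthetic task as the Myboost demo might look like this (hyperparameters are illustrative, and the callback-based early stopping assumes a recent lightgbm release):

import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=400, n_features=8, n_informative=4, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# hold out part of the training set for early stopping, as Myboost does
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0)

model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.2, max_depth=4)
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
          callbacks=[lgb.early_stopping(stopping_rounds=10)])

print("Test MSE: %.4f" % mean_squared_error(y_test, model.predict(X_test)))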