DataWhale - Tree Models and Ensemble Learning - Task 06 - Gradient Boosting Trees 02 - 2021-10

Link: Part D: Gradient Boosting Trees — Datawhale

I. Exercises

1. Exercise 01

Solution:

(1) \begin{equation} \begin{aligned} L^{(m)}(F)&=\gamma T+\frac{\lambda}{2}\sum_{j=1}^{T}\big(F_j^{(m)}-F_j^{(m-1)}\big)^2+\sum_{i=1}^{N}L\big(y_i,F_i^{(m)}\big)\\ &\approx\gamma T+\frac{\lambda}{2}\sum_{j=1}^{T}\big(F_j^{(m)}-F_j^{(m-1)}\big)^2+\sum_{i=1}^{N}\Big[L\big(y_i,F_i^{(m-1)}\big)+\frac{\partial L}{\partial F_i^{(m)}}\Big|_{F_i^{(m)}=F_i^{(m-1)}}\big(F_i^{(m)}-F_i^{(m-1)}\big)+\frac{1}{2}\frac{\partial^2L}{\partial F_i^{(m)2}}\Big|_{F_i^{(m)}=F_i^{(m-1)}}\big(F_i^{(m)}-F_i^{(m-1)}\big)^2\Big] \end{aligned} \end{equation}

(2) The regularization term \frac{1}{2}\lambda\sum_{j=1}^{T}w_j^2 is introduced precisely because the model wants each fitted increment w_j to be small. The penalty is quadratic, and since w_j is a small quantity, the third- and higher-order terms of the expansion are negligible, so expanding beyond second order is not meaningful.

(3)

L=\frac{1}{2}\sum_{i=1}^{N}\big(y_i-F_i^{(m-1)}-h_i\big)^2

p_i=\frac{\partial L}{\partial h_i}\Big|_{h_i=0}=F_i^{(m-1)}-y_i

q_i=\frac{\partial^2 L}{\partial h_i^2}\Big|_{h_i=0}=1

Therefore

L^{(m)}=\gamma T+\sum_{j=1}^{T}\Big[\Big(\sum_{i\in I_j}\big(F_i^{(m-1)}-y_i\big)\Big)w_j+\frac{1}{2}\Big(\sum_{i\in I_j}1+\lambda\Big)w_j^2\Big]
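Minimizing this objective leaf by leaf (it is a quadratic in each w_j) completes the derivation and gives the standard XGBoost leaf weight:

w_j^*=-\frac{\sum_{i\in I_j}\big(F_i^{(m-1)}-y_i\big)}{\sum_{i\in I_j}1+\lambda}=\frac{\sum_{i\in I_j}\big(y_i-F_i^{(m-1)}\big)}{|I_j|+\lambda}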

2. Exercise 02

Solution:

(1) Root Absolute Error: L=\sqrt{|\hat{y}-y|}

q_i=\frac{\partial^2 L}{\partial \hat{y}^2}=-\frac{1}{4}|\hat{y}-y|^{-3/2}<0

The second derivative q_i is negative everywhere, so the leaf-weight denominator \sum_{i\in I_j}q_i+\lambda is not guaranteed to be positive and this cannot be used as a loss function.

(2) Squared Log Error:

q_i=\frac{1}{(\hat{y}+1)^2}\Big[1-\log\Big(\frac{\hat{y}+1}{y+1}\Big)\Big]

This is not always positive across samples (it turns negative once \log\frac{\hat{y}+1}{y+1}>1), so it cannot be used as a loss function either.

(3) Pseudo Huber Error:

q_i=\Big[1+\frac{(\hat{y}-y)^2}{\delta^2}\Big]^{-3/2}>0

Since q_i is always positive, this one can be used as a loss function (the numerical check below confirms these three signs).
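As a sanity check on the three conclusions above, here is a small numerical sketch (not part of the original notes): it estimates each second derivative with central finite differences, using the loss forms assumed from the exercise names and \delta=1 for the Pseudo Huber case.

import numpy as np

def second_derivative(loss, y, f, eps=1e-4):
    # central finite-difference estimate of d^2 L / d f^2 at prediction f
    return (loss(y, f + eps) - 2 * loss(y, f) + loss(y, f - eps)) / eps ** 2

# candidate losses (forms assumed from the exercise statement)
root_abs = lambda y, f: np.sqrt(np.abs(f - y))                                   # Root Absolute Error
sq_log = lambda y, f: 0.5 * np.log((f + 1) / (y + 1)) ** 2                       # Squared Log Error
pseudo_hub = lambda y, f, d=1.0: d ** 2 * (np.sqrt(1 + ((f - y) / d) ** 2) - 1)  # Pseudo Huber Error

y_true = 2.0
for f in [0.5, 1.5, 3.0, 10.0]:
    print("f=%5.1f  q_RAE=%8.3f  q_SLE=%8.3f  q_PH=%8.3f" % (
        f,
        second_derivative(root_abs, y_true, f),
        second_derivative(sq_log, y_true, f),
        second_derivative(pseudo_hub, y_true, f)))

The printout shows q_RAE always negative, q_SLE changing sign (negative once the log ratio exceeds 1, e.g. at f=10), and q_PH always positive, matching the answers above.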

3. Exercise 03

(1) Solution:

In the best case 2 colors suffice; in the worst case d colors are needed (d being the number of features).

(2) Solution:

The worst case occurs when every feature has to be its own bundle of exclusive features, i.e. when every pair of features conflicts and no two features are mutually exclusive, as the greedy-coloring sketch below illustrates.
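The color bounds can be seen with a tiny greedy-coloring sketch (illustrative code, not from the original notes): vertices are features, edges join features that conflict (are not mutually exclusive), and features sharing a color form one bundle.

import numpy as np

def greedy_coloring(adj):
    # greedy coloring of a conflict graph given as a boolean adjacency matrix;
    # features (vertices) that share a color can be bundled together
    d = adj.shape[0]
    color = np.full(d, -1)
    for v in range(d):
        used = {color[u] for u in range(d) if adj[v, u] and color[u] >= 0}
        c = 0
        while c in used:
            c += 1
        color[v] = c
    return color

d = 5
# worst case: every pair of features conflicts (complete graph) -> d colors
complete = np.ones((d, d), dtype=bool) & ~np.eye(d, dtype=bool)
# best case with at least one conflict: a star graph is 2-colorable -> 2 colors
star = np.zeros((d, d), dtype=bool)
star[0, 1:] = star[1:, 0] = True

print(greedy_coloring(complete).max() + 1)  # 5
print(greedy_coloring(star).max() + 1)      # 2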

II. Code Implementation

1. XGBoost implementation (from teacher GYH)

# BasicTree.py

import numpy as np


def MSE(y):
    return ((y - y.mean())**2).sum() / y.shape[0]

class Node:

    def __init__(self, depth, idx):
        self.depth = depth
        self.idx = idx

        self.left = None
        self.right = None
        self.feature = None
        self.pivot = None
        
class Tree:

    def __init__(self, max_depth, lamda, gamma):
        self.max_depth = max_depth
        self.lamda=lamda
        self.gamma=gamma

        self.X = None
        self.y = None
        self.feature_importances_ = None
    
    def _able_to_split(self,node):
        return (node.depth < self.max_depth) & (node.idx.sum() >= 2)
    
    def _get_inner_split_score(self,to_left,to_right):
        return self._get_score(to_left)+self._get_score(to_right)
    
    def _inner_split(self,col,idx):
        data=self.X[:,col]
        best_val=-np.infty
        for pivot in data[:-1]:
            to_left=(idx==1) & (data<=pivot)
            to_right=(idx==1) &(~to_left)
            if to_left.sum()==0 or to_left.sum()==idx.sum():
                continue
            Hyx=self._get_inner_split_score(to_left, to_right)
            if best_val<Hyx:
                best_val,best_pivot=Hyx,pivot
                best_to_left,best_to_right=to_left,to_right
        return best_val, best_to_left, best_to_right, best_pivot
    
    def _get_leaf_score(self,idx):
        best_val=-np.infty
        for col in range(self.X.shape[1]):
            Hyx, _idx_left, _idx_right, pivot=self._inner_split(col, idx)
            if best_val<Hyx:
                best_val, idx_left, idx_right=Hyx, _idx_left, _idx_right
                best_feature, best_pivot=col, pivot
        return best_val, idx_left, idx_right, best_feature, best_pivot
    
    def _get_score(self,idx):
        # XGBoost structure score of a node: G^2 / (H + lambda), where G and H
        # are the sums of the gradients p and hessians q of the samples in idx
        return self.p[idx].sum()**2/(self.q[idx].sum() + self.lamda)
    
    
    def split(self,node):
        if not self._able_to_split(node):
            return None, None, None, None
        node_score =self._get_score(node.idx)
        (
            leaf_score,
            idx_left,
            idx_right,
            feature,
            pivot
        ) = self._get_leaf_score(node.idx)
        # split gain: (score_left + score_right - score_parent) / 2 - gamma
        gain = (leaf_score - node_score)/2 - self.gamma
        relative_gain=node.idx.sum()/self.X.shape[0]*gain
        self.feature_importances_[feature]+=relative_gain
        node.left=Node(node.depth+1,idx_left)
        node.right=Node(node.depth+1,idx_right)
        self.depth=max(node.depth+1,self.depth)
        return idx_left, idx_right, feature, pivot
    
    def build_prepare(self):
        self.depth=0
        self.feature_importances_ = np.zeros(self.X.shape[1])
        self.root=Node(depth=0,idx=np.ones(self.X.shape[0])==1)
        

    def build_node(self, cur_node):
        if cur_node is None:
            return
        idx_left, idx_right, feature, pivot = self.split(cur_node)
        cur_node.feature, cur_node.pivot = feature, pivot
        self.build_node(cur_node.left)
        self.build_node(cur_node.right)    
        
    def build(self):
        self.build_prepare()
        self.build_node(self.root)    
        
    
    def _search_prediction(self, node, x):
        if node.left is None and node.right is None:
            # XGBoost leaf weight: w* = -G / (H + lambda)
            return -self.p[node.idx].sum() / (self.q[node.idx].sum() + self.lamda)
        if x[node.feature] <= node.pivot:
            node = node.left
        else:
            node = node.right
        return self._search_prediction(node, x)

    def predict(self, x):
        return self._search_prediction(self.root, x)
        

class DecisionTreeRegressor:
    
    def __init__(self, max_depth, lamda, gamma):
        self.tree = Tree(max_depth, lamda, gamma)
    
    def fit(self, X, y, p, q):
        self.tree.X = X
        self.tree.y = y
        self.tree.p = p
        self.tree.q = q
        self.tree.build()
        self.feature_importances_=(
            self.tree.feature_importances_/self.tree.feature_importances_.sum()    
        
        )
        return self

    def predict(self, X):
        return np.array([self.tree.predict(x) for x in X])
# training script (a separate file that imports BasicTree.py)
from BasicTree import DecisionTreeRegressor as DT
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

class Myboost:
    
    def __init__(self, max_depth=4, n_estimator=1000, lamda=1, gamma=0, lr=0.2):
        self.max_depth = max_depth
        self.n_estimator = n_estimator
        self.lamda = lamda
        self.gamma = gamma
        self.lr = lr
        self.booster = []
        self.feature_importances_ = 0
        self.best_round = None
        
        
    def record_score(self, y_train, y_val, train_predict, val_predict, i):
        mse_val =  mean_squared_error(y_val, val_predict)
        mse_train = mean_squared_error(y_train, train_predict)
        print("第%d轮\t训练集: %.4f\t" 
              "验证集: %.4f"%(i+1, mse_train, mse_val))
        return mse_val
    
    def fit(self, X, y):
        # split the data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.25, random_state=0)
        train_predict, val_predict = 0, 0
        # gradient and hessian of the squared loss L = (y - F)^2 / 2 at the
        # initial prediction F = 0: p = F - y, q = 1
        p = -(y_train - train_predict)
        q = np.ones(X_train.shape[0])
        last_val_score = np.infty
        for i in range(self.n_estimator):
            cur_booster = DT(self.max_depth, self.lamda, self.gamma)
            cur_booster.fit(X_train, y_train, p, q)
            self.feature_importances_+=cur_booster.feature_importances_
            train_predict += cur_booster.predict(X_train)*self.lr
            val_predict += cur_booster.predict(X_val)*self.lr
            p = -(y_train - train_predict)
            self.booster.append(cur_booster)
            cur_val_score = self.record_score(y_train, y_val, train_predict, val_predict, i)
            if cur_val_score > last_val_score:
                # the last tree made validation worse: keep only the first i trees
                self.best_round = i
                print("\nTraining finished! Best number of rounds: %d" % i)
                break
            last_val_score = cur_val_score
        if self.best_round is None:
            # validation error never increased: keep all trees
            self.best_round = self.n_estimator
            
    def predict(self, X):
        cur_predict = 0
        # stop at the number of rounds with the best validation score to avoid overfitting
        for i in range(self.best_round):
            cur_predict += self.lr * self.booster[i].predict(X)
        return cur_predict


if __name__ == "__main__":

    X, y = make_regression(
        n_samples=400, n_features=8, n_informative=4, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0)

    model = Myboost()
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    mse = mean_squared_error(y_test, prediction)
    print("\n测试集的MSE为 %.4f"%(mse))

2. XGBoost implementation (from scratch)

XGBoost implemented in Python from scratch - Zhihu

https://github.com/RRdmlearning/Machine-Learning-From-Scratch/tree/master/xgboost

# excerpt from the repository linked above; DecisionTree, LeastSquaresLoss
# and bar_widgets are defined in the repository's other modules
import numpy as np
import progressbar

class XGBoostRegressionTree(DecisionTree):
    """
    Regression tree for XGBoost
    - Reference -
    http://xgboost.readthedocs.io/en/latest/model.html
    """
 
    def _split(self, y):
        """ y contains y_true in left half of the middle column and
        y_pred in the right half. Split and return the two matrices """

        col = int(np.shape(y)[1]/2)
        y, y_pred = y[:, :col], y[:, col:]
        return y, y_pred
 
    def _gain(self, y, y_pred):
        numerator = np.power((self.loss.gradient(y, y_pred)).sum(), 2)
        denominator = self.loss.hess(y, y_pred).sum()
        return 0.5 * (numerator / denominator)
 
    def _gain_by_taylor(self, y, y1, y2):
        # Split
        y, y_pred = self._split(y)
        y1, y1_pred = self._split(y1)
        y2, y2_pred = self._split(y2)
 
        true_gain = self._gain(y1, y1_pred)
        false_gain = self._gain(y2, y2_pred)
        gain = self._gain(y, y_pred)
        return true_gain + false_gain - gain
 
    def _approximate_update(self, y):
        # y split into y, y_pred
        y, y_pred = self._split(y)
        gradient = np.sum(self.loss.gradient(y, y_pred),axis=0)
        hessian = np.sum(self.loss.hess(y, y_pred), axis=0)
        update_approximation =  gradient / hessian
        return update_approximation
 
 
    def fit(self, X, y):
        self._impurity_calculation = self._gain_by_taylor
        self._leaf_value_calculation = self._approximate_update
        super(XGBoostRegressionTree, self).fit(X, y)
class XGBoost(object):
    """The XGBoost model (used here for regression with a least-squares loss).

    Reference: http://xgboost.readthedocs.io/en/latest/model.html

    n_estimators: int
        The number of trees that are used.
    learning_rate: float
        The step length (shrinkage) applied when following the negative gradient
        during training.
    min_samples_split: int
        The minimum number of samples needed to make a split when building a tree.
    min_impurity: float
        The minimum impurity required to split the tree further (splitting stops below it).
    max_depth: int
        The maximum depth of each tree (splitting stops beyond it).
    """
    def __init__(self, n_estimators=200, learning_rate=0.01, min_samples_split=2,
                 min_impurity=1e-7, max_depth=2):
        self.n_estimators = n_estimators  # Number of trees
        self.learning_rate = learning_rate  # Step size for weight update
        self.min_samples_split = min_samples_split  # The minimum number of samples to justify a split
        self.min_impurity = min_impurity  # Minimum variance reduction to continue
        self.max_depth = max_depth  # Maximum depth for tree
 
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)
 
        # Squared-error loss (regression)
        self.loss = LeastSquaresLoss()
 
        # Initialize regression trees
        self.trees = []
        for _ in range(n_estimators):
            tree = XGBoostRegressionTree(
                min_samples_split=self.min_samples_split,
                min_impurity=min_impurity,
                max_depth=self.max_depth,
                loss=self.loss)
 
            self.trees.append(tree)
 
    def fit(self, X, y):
        # y = to_categorical(y)
        m = X.shape[0]
        y = np.reshape(y, (m, -1))
        y_pred = np.zeros(np.shape(y))
        for i in self.bar(range(self.n_estimators)):
            tree = self.trees[i]
            y_and_pred = np.concatenate((y, y_pred), axis=1)
            tree.fit(X, y_and_pred)
            update_pred = tree.predict(X)
            update_pred = np.reshape(update_pred, (m, -1))
            # apply shrinkage by the learning rate before accumulating
            y_pred += self.learning_rate * update_pred
 
    def predict(self, X):
        y_pred = None
        m = X.shape[0]
        # Make predictions
        for tree in self.trees:
            # Estimate gradient and update prediction
            update_pred = tree.predict(X)
            update_pred = np.reshape(update_pred, (m, -1))
            if y_pred is None:
                y_pred = np.zeros_like(update_pred)
            # accumulate each tree's (shrunken) contribution
            y_pred += self.learning_rate * update_pred
 
        return y_pred
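A possible usage sketch for the excerpt above (hypothetical: it assumes the repository's modules, and therefore the XGBoost class itself, are importable), run on the same synthetic regression data used earlier.

from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=400, n_features=8, n_informative=4, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

model = XGBoost(n_estimators=100, learning_rate=0.5, max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test).flatten()
print("From-scratch XGBoost test MSE: %.4f" % mean_squared_error(y_test, y_pred))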

3. LightGBM implementation
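A minimal sketch with the lightgbm package (assuming it is installed), fitting the same synthetic regression data used above; LGBMRegressor's reg_lambda and learning_rate play the same roles as lamda and lr in Myboost.

import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=400, n_features=8, n_informative=4, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

model = lgb.LGBMRegressor(n_estimators=200, learning_rate=0.2, max_depth=4, reg_lambda=1)
model.fit(X_train, y_train)
print("LightGBM test MSE: %.4f" % mean_squared_error(y_test, model.predict(X_test)))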
