Computing XGBoost feature importance (feature_importance)

1. The xgboost native interface and the sklearn interface expose feature importance differently:
bst = xgb.train(param, d1_train, num_boost_round=100, evals=watch_list)
xgc = xgb.XGBClassifier(objective='binary:logistic', seed=10086, **bst_params)

With the sklearn interface, xgc.feature_importances_ is equivalent (up to the normalization shown below) to xgc.get_booster().get_fscore(), which is the same as xgc.get_booster().get_score(importance_type="weight").

With the native interface, you call bst.get_fscore() or bst.get_score(importance_type="weight") directly on the Booster.
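For concreteness, here is a minimal sketch (not from the original post; the toy dataset and the parameter values are made up purely for illustration, unlike the author's d1_train / bst_params) showing where the importance calls live in each interface:

import xgboost as xgb
from sklearn.datasets import make_classification

# Toy data purely for illustration.
X, y = make_classification(n_samples=500, n_features=8, random_state=0)

# Native interface: train a Booster, then query it directly.
dtrain = xgb.DMatrix(X, label=y)
params = {"objective": "binary:logistic", "max_depth": 3, "eta": 0.1}
bst = xgb.train(params, dtrain, num_boost_round=50)
print(bst.get_fscore())                                   # {feature: split count}
print(bst.get_score(importance_type="weight"))            # the same scores

# sklearn interface: fit the estimator, then read feature_importances_
# or drop down to the underlying Booster.
xgc = xgb.XGBClassifier(objective="binary:logistic", n_estimators=50, max_depth=3)
xgc.fit(X, y)
print(xgc.feature_importances_)                           # one value per feature, normalized
print(xgc.get_booster().get_score(importance_type="weight"))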

2. Taking the sklearn interface as an example, there are two ways to look at xgboost feature importance:
xgc.feature_importances_ and xgb.plot_importance(xgc, max_num_features=10)
However, the two produce different results:
[Figures: outputs of xgc.feature_importances_ and xgb.plot_importance]
Some posts online claim that one of them uses importance_type='gain' and the other 'weight'. That is not actually the case; look at the source code:

    @property
    def feature_importances_(self):
        """
        Returns
        -------
        feature_importances_ : array of shape = [n_features]

        """
        b = self.get_booster()
        fs = b.get_fscore()                                   # split counts, i.e. importance_type='weight'
        all_features = [fs.get(f, 0.) for f in b.feature_names]
        all_features = np.array(all_features, dtype=np.float32)
        return all_features / all_features.sum()              # normalized so the values sum to 1

def plot_importance(booster, ax=None, height=0.2,
                    xlim=None, ylim=None, title='Feature importance',
                    xlabel='F score', ylabel='Features',
                    importance_type='weight', max_num_features=None,
                    grid=True, show_values=True, **kwargs):

    """Plot importance based on fitted trees.

    Parameters
    ----------
    booster : Booster, XGBModel or dict
        Booster or XGBModel instance, or dict taken by Booster.get_fscore()
    ax : matplotlib Axes, default None
        Target axes instance. If None, new figure and axes will be created.
    grid : bool, Turn the axes grids on or off.  Default is True (On).
    importance_type : str, default "weight"
        How the importance is calculated: either "weight", "gain", or "cover"
        "weight" is the number of times a feature appears in a tree
        "gain" is the average gain of splits which use the feature
        "cover" is the average coverage of splits which use the feature
            where coverage is defined as the number of samples affected by the split
    max_num_features : int, default None
        Maximum number of top features displayed on plot. If None, all features will be displayed.
    height : float, default 0.2
        Bar height, passed to ax.barh()
    xlim : tuple, default None
        Tuple passed to axes.xlim()
    ylim : tuple, default None
        Tuple passed to axes.ylim()
    title : str, default "Feature importance"
        Axes title. To disable, pass None.
    xlabel : str, default "F score"
        X axis title label. To disable, pass None.
    ylabel : str, default "Features"
        Y axis title label. To disable, pass None.
    show_values : bool, default True
        Show values on plot. To disable, pass False.
    kwargs :
        Other keywords passed to ax.barh()

    Returns
    -------
    ax : matplotlib Axes
    """
    # TODO: move this to compat.py
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError('You must install matplotlib to plot importance')

    if isinstance(booster, XGBModel):
        importance = booster.get_booster().get_score(importance_type=importance_type)
    elif isinstance(booster, Booster):
        importance = booster.get_score(importance_type=importance_type)
    elif isinstance(booster, dict):
        importance = booster
    else:
        raise ValueError('tree must be Booster, XGBModel or dict instance')

    if len(importance) == 0:
        raise ValueError('Booster.get_score() results in empty')

From the source we can see that both use importance_type='weight', i.e. the number of times a feature is used for splitting; the only difference is that feature_importances_ normalizes the returned values.
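To make the normalization explicit, here is a small sketch (reusing the xgc and X fitted in the earlier sketch) that rebuilds that older feature_importances_ by hand from the weight scores:

import numpy as np

b = xgc.get_booster()
fs = b.get_score(importance_type="weight")                          # {feature name: split count}
names = b.feature_names or [f"f{i}" for i in range(X.shape[1])]     # 'f0', 'f1', ... when no names were given
weights = np.array([fs.get(f, 0.0) for f in names], dtype=np.float32)
manual = weights / weights.sum()                                     # normalize so the values sum to 1
print(manual)
print(xgc.feature_importances_)                                      # matches when importance_type is 'weight'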

class XGBModel(XGBModelBase):
    # pylint: disable=too-many-arguments, too-many-instance-attributes, missing-docstring
    def __init__(self, max_depth=None, learning_rate=None, n_estimators=100,
                 verbosity=None, objective=None, booster=None,
                 tree_method=None, n_jobs=None, gamma=None,
                 min_child_weight=None, max_delta_step=None, subsample=None,
                 colsample_bytree=None, colsample_bylevel=None,
                 colsample_bynode=None, reg_alpha=None, reg_lambda=None,
                 scale_pos_weight=None, base_score=None, random_state=None,
                 missing=np.nan, num_parallel_tree=None,
                 monotone_constraints=None, interaction_constraints=None,
                 importance_type="gain", gpu_id=None,
                 validate_parameters=None, **kwargs):

    @property
    def feature_importances_(self):
        """
        Feature importances property
        .. note:: Feature importance is defined only for tree boosters
            Feature importance is only defined when the decision tree model is chosen as base
            learner (`booster=gbtree`). It is not defined for other base learner types, such
            as linear learners (`booster=gblinear`).
        Returns
        -------
        feature_importances_ : array of shape ``[n_features]``
        """
        if self.get_params()['booster'] not in {'gbtree', 'dart'}:
            raise AttributeError(
                'Feature importance is not defined for Booster type {}'
                .format(self.booster))
        b = self.get_booster()
        score = b.get_score(importance_type=self.importance_type)  # honours the importance_type passed to the constructor
        all_features = [score.get(f, 0.) for f in b.feature_names]
        all_features = np.array(all_features, dtype=np.float32)
        return all_features / all_features.sum()

After the xgboost version update, feature_importances_ uses importance_type='gain' by default.
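In other words, in newer versions importance_type is a constructor argument of the sklearn wrapper and feature_importances_ follows it. A quick sketch (same toy X, y as above; the variable names are mine):

xgc_gain = xgb.XGBClassifier(importance_type="gain").fit(X, y)       # the newer default
xgc_weight = xgb.XGBClassifier(importance_type="weight").fit(X, y)   # reproduces the old behaviour
print(xgc_gain.feature_importances_)
print(xgc_weight.feature_importances_)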

3. Comparing different importance_type values
[Figure: feature importance rankings under different importance_type settings]
The feature importance rankings differ across methods. In fact, there are three measures of feature importance, and in practice each of the three options can rank features very differently (see the sketch after this list):

  1. Weight: the number of times a feature is used to split the data across all trees.

  2. Cover: the number of times a feature is used to split the data across all trees, weighted by how many data points pass through those splits. In practice, cover can be understood as the sum of the second-order gradients (Hessians) of the samples routed to the node, and the feature's score is the average cover over its splits.

  3. Gain: the average reduction in training loss when the feature is used for splitting.
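The sketch below (again reusing the fitted xgc from the earlier sketch) queries all three measures from the same Booster, so the ranking differences are easy to see:

booster = xgc.get_booster()
for imp_type in ("weight", "gain", "cover"):
    scores = booster.get_score(importance_type=imp_type)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    print(imp_type, ranked[:5])                           # top-5 features under each measure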

Finally, SHAP values are recommended for measuring feature importance; see this blog post, as well as the earlier explanation of how feature_importance works.
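A rough sketch of the SHAP route (shap is a separate package, pip install shap; the calls below are illustrative and not from the original post, again reusing xgc and X from above):

import numpy as np
import shap

explainer = shap.TreeExplainer(xgc)            # TreeExplainer supports tree models such as xgboost
shap_values = explainer.shap_values(X)         # per-sample, per-feature contributions
print(np.abs(shap_values).mean(axis=0))        # mean |SHAP| per feature: a common global importance measure
shap.summary_plot(shap_values, X)              # optional visual summary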

The original draft of this article was based on this blog post.
