sklearn/tree/_splitter.pyx

This post walks through how the _splitter.pyx module in scikit-learn's tree package works, in particular the search for the best feature split. Candidate split points are evaluated according to the supplied splitting criterion in order to find the best one. Before a node is split, the node_reset function is called to initialize the splitter, which in turn calls the criterion's init function (for example a RegressionCriterion in regression problems).

_splitter implements the search for the best feature split: using the criterion (splitting criterion) passed to it, it computes the relevant impurity measures and determines the best split point.

        X : object
            This contains the inputs. Usually it is a 2d numpy array.

        y : ndarray, dtype=DOUBLE_t
            This is the vector of targets, or true labels, for the samples

       For regression, y holds the actual target values; for classification, it holds the corresponding class labels.
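
At the Python level these internals are selected by name on the estimator, so a minimal usage sketch (standard public scikit-learn API) is enough to see how the pieces fit together: criterion="squared_error" maps to a RegressionCriterion, splitter="best" maps to the best-split Splitter (BestSplitter for dense input), and setting max_leaf_nodes makes the tree grow best-first, which is handled by the BestFirstTreeBuilder excerpted next.

# Minimal usage sketch: the string parameters select the Cython objects
# implemented in _criterion.pyx / _splitter.pyx / _tree.pyx.
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, n_features=5, random_state=0)

reg = DecisionTreeRegressor(
    criterion="squared_error",  # "mse" in older scikit-learn releases
    splitter="best",            # best-split search, as opposed to "random"
    max_leaf_nodes=8,           # triggers best-first growth
    random_state=0,
)
reg.fit(X, y)
print(reg.get_n_leaves())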

cdef class BestFirstTreeBuilder(TreeBuilder):
    """Build a decision tree in best-first fashion.

    The best node to expand is given by the node at the frontier that has the
    highest impurity improvement.
    """
   ......
   ......

    cdef inline int _add_split_node(self, Splitter splitter, Tree tree,
                                    SIZE_t start, SIZE_t end, double impurity,
                                    bint is_first, bint is_left, Node* parent,
                                    SIZE_t depth,
                                    PriorityHeapRecord* res) nogil except -1:
        """Adds node w/ partition ``[start, end)`` to the frontier. """
        cdef SplitRecord split
        cdef SIZE_t node_id
        cdef SIZE_t n_node_samples
        cdef SIZE_t n_constant_features = 0
        cdef double weighted_n_samples = splitter.weighted_n_samples
        cdef double min_impurity_decrease = self.min_impurity_decrease
        cdef double min_impurity_split = self.min_impurity_split
        cdef double weighted_n_node_samples
        cdef bint is_leaf
        cdef SIZE_t n_left, n_right
        cdef double imp_diff

        splitter.node_reset(start, end, &weighted_n_node_samples)

      For each node to be split, node_reset is called first to initialize the splitter on samples[start:end]. Inside splitter.node_reset(start, end, &weighted_n_node_samples), the criterion's init function is invoked.

 self.criterion.init

       This calls the criterion's init function:

        self.criterion.init(self.y,
                            self.sample_weight,
                            self.weighted_n_samples,
                            self.samples,
                            start,
                            end)

Taking RegressionCriterion as an example, init accumulates the node statistics over samples[start:end]: the total sample weight, the weighted sum of the targets, and the weighted sum of their squares. These later yield the node impurity as the weighted variance of y.
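
The actual Cython implementation is not reproduced here; the NumPy sketch below is only a conceptual illustration of that initialization for single-output regression (the function and variable names are illustrative, not the real ones):

import numpy as np

def regression_criterion_init_sketch(y, sample_weight, samples, start, end):
    # Restrict to the node's samples, exactly as criterion.init receives them.
    idx = np.asarray(samples[start:end])
    w = sample_weight[idx] if sample_weight is not None else np.ones(len(idx))
    y_node = y[idx]

    weighted_n_node_samples = w.sum()
    sum_total = (w * y_node).sum()           # weighted sum of targets
    sq_sum_total = (w * y_node ** 2).sum()   # weighted sum of squared targets

    # MSE node impurity = weighted variance of y on this node.
    mean = sum_total / weighted_n_node_samples
    impurity = sq_sum_total / weighted_n_node_samples - mean ** 2
    return weighted_n_node_samples, sum_total, sq_sum_total, impurity

For reference, the abstract Splitter base class that issues these criterion calls is reproduced below.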

cdef class Splitter:
    """Abstract splitter class.

    Splitters are called by tree builders to find the best splits on both
    sparse and dense data, one split at a time.
    """

    def __cinit__(self, Criterion criterion, SIZE_t max_features,
                  SIZE_t min_samples_leaf, double min_weight_leaf,
                  object random_state):
        """
        Parameters
        ----------
        criterion : Criterion
            The criterion to measure the quality of a split.

        max_features : SIZE_t
            The maximal number of randomly selected features which can be
            considered for a split.

        min_samples_leaf : SIZE_t
            The minimal number of samples each leaf can have, where splits
            which would result in having less samples in a leaf are not
            considered.

        min_weight_leaf : double
            The minimal weight each leaf can have, where the weight is the sum
            of the weights of each sample in it.

        random_state : object
            The user inputted random state to be used for pseudo-randomness
        """

        self.criterion = criterion

        self.samples = NULL
        self.n_samples = 0
        self.features = NULL
        self.n_features = 0
        self.feature_values = NULL

        self.sample_weight = NULL

        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.random_state = random_state

    def __dealloc__(self):
        """Destructor."""

        free(self.samples)
        free(self.features)
        free(self.constant_features)
        free(self.feature_values)

    def __getstate__(self):
        return {}

    def __setstate__(self, d):
        pass

    cdef int init(self,
                   object X,
                   const DOUBLE_t[:, ::1] y,
                   DOUBLE_t* sample_weight) except -1:
        """Initialize the splitter.

        Take in the input data X, the target Y, and optional sample weights.

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.

        Parameters
        ----------
        X : object
            This contains the inputs. Usually it is a 2d numpy array.

        y : ndarray, dtype=DOUBLE_t
            This is the vector of targets, or true labels, for the samples

        sample_weight : DOUBLE_t*
            The weights of the samples, where higher weighted samples are fit
            closer than lower weight samples. If not provided, all samples
            are assumed to have uniform weight.
        """

        self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
        cdef SIZE_t n_samples = X.shape[0]

        # Create a new array which will be used to store nonzero
        # samples from the feature of interest
        cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples)

        cdef SIZE_t i, j
        cdef double weighted_n_samples = 0.0
        j = 0

        for i in range(n_samples):
            # Only work with positively weighted samples
            if sample_weight == NULL or sample_weight[i] != 0.0:
                samples[j] = i
                j += 1

            if sample_weight != NULL:
                weighted_n_samples += sample_weight[i]
            else:
                weighted_n_samples += 1.0

        # Number of samples is number of positively weighted samples
        self.n_samples = j
        self.weighted_n_samples = weighted_n_samples

        cdef SIZE_t n_features = X.shape[1]
        cdef SIZE_t* features = safe_realloc(&self.features, n_features)

        for i in range(n_features):
            features[i] = i

        self.n_features = n_features

        safe_realloc(&self.feature_values, n_samples)
        safe_realloc(&self.constant_features, n_features)

        self.y = y

        self.sample_weight = sample_weight
        return 0

    cdef int node_reset(self, SIZE_t start, SIZE_t end,
                        double* weighted_n_node_samples) nogil except -1:
        """Reset splitter on node samples[start:end].

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.

        Parameters
        ----------
        start : SIZE_t
            The index of the first sample to consider
        end : SIZE_t
            The index of the last sample to consider
        weighted_n_node_samples : ndarray, dtype=double pointer
            The total weight of those samples
        """

        self.start = start
        self.end = end

        self.criterion.init(self.y,
                            self.sample_weight,
                            self.weighted_n_samples,
                            self.samples,
                            start,
                            end)

        weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
        return 0

    cdef int node_split(self, double impurity, SplitRecord* split,
                        SIZE_t* n_constant_features) nogil except -1:
        """Find the best split on node samples[start:end].

        This is a placeholder method. The majority of computation will be done
        here.

        It should return -1 upon errors.
        """

        pass

    cdef void node_value(self, double* dest) nogil:
        """Copy the value of node samples[start:end] into dest."""

        self.criterion.node_value(dest)

    cdef double node_impurity(self) nogil:
        """Return the impurity of the current node."""

        return self.criterion.node_impurity()
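
The quantities produced by these methods end up in the fitted Tree object, so their effect can be inspected from Python through the documented tree_ arrays. A small runnable example (public scikit-learn API):

from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
reg = DecisionTreeRegressor(max_depth=2, random_state=0).fit(X, y)

t = reg.tree_
# Per node: the split chosen by node_split (feature index and threshold),
# the impurity returned by node_impurity, the weight computed in node_reset,
# and the prediction written by node_value. Leaves use feature/threshold = -2.
for node_id in range(t.node_count):
    print(node_id,
          t.feature[node_id], t.threshold[node_id],
          t.impurity[node_id],
          t.weighted_n_node_samples[node_id],
          t.value[node_id].ravel())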

 
