_splitter用于最优特征分裂的搜索,根据传入的criterion(分裂准则)计算相关衡量指标,确定最优分裂点。
X : object
This contains the inputs. Usually it is a 2d numpy array.
y : ndarray, dtype=DOUBLE_t
This is the vector of targets, or true labels, for the samples
对于回归,则是实际值。对于分类则是实际对应的label.
cdef class BestFirstTreeBuilder(TreeBuilder):
"""Build a decision tree in best-first fashion.
The best node to expand is given by the node at the frontier that has the
highest impurity improvement.
"""
......
......
cdef inline int _add_split_node(self, Splitter splitter, Tree tree,
SIZE_t start, SIZE_t end, double impurity,
bint is_first, bint is_left, Node* parent,
SIZE_t depth,
PriorityHeapRecord* res) nogil except -1:
"""Adds node w/ partition ``[start, end)`` to the frontier. """
cdef SplitRecord split
cdef SIZE_t node_id
cdef SIZE_t n_node_samples
cdef SIZE_t n_constant_features = 0
cdef double weighted_n_samples = splitter.weighted_n_samples
cdef double min_impurity_decrease = self.min_impurity_decrease
cdef double min_impurity_split = self.min_impurity_split
cdef double weighted_n_node_samples
cdef bint is_leaf
cdef SIZE_t n_left, n_right
cdef double imp_diff
splitter.node_reset(start, end, &weighted_n_node_samples)
对于每一个分裂节点,会先调用 splitter.node_reset(start, end, &weighted_n_node_samples) 进行初始化;在该函数内部,会调用 self.criterion.init 函数。
self.criterion.init
调用criterion的 init函数。
self.criterion.init(self.y,
self.sample_weight,
self.weighted_n_samples,
self.samples,
start,
end)
以RegressionCriterion为例,则对应的调用函数如下:
cdef class Splitter:
    """Abstract splitter class.

    Splitters are called by tree builders to find the best splits on both
    sparse and dense data, one split at a time.

    Concrete subclasses implement ``node_split``; this base class owns the
    shared bookkeeping: the sample/feature index buffers, sample weights,
    and delegation to the wrapped ``Criterion``.
    """
    def __cinit__(self, Criterion criterion, SIZE_t max_features,
                  SIZE_t min_samples_leaf, double min_weight_leaf,
                  object random_state):
        """
        Parameters
        ----------
        criterion : Criterion
            The criterion to measure the quality of a split.
        max_features : SIZE_t
            The maximal number of randomly selected features which can be
            considered for a split.
        min_samples_leaf : SIZE_t
            The minimal number of samples each leaf can have, where splits
            which would result in having less samples in a leaf are not
            considered.
        min_weight_leaf : double
            The minimal weight each leaf can have, where the weight is the sum
            of the weights of each sample in it.
        random_state : object
            The user inputted random state to be used for pseudo-randomness
        """
        self.criterion = criterion
        # The index/value buffers are allocated lazily in init() once the
        # data dimensions (n_samples, n_features) are known; start empty.
        self.samples = NULL
        self.n_samples = 0
        self.features = NULL
        self.n_features = 0
        self.feature_values = NULL
        self.sample_weight = NULL
        self.max_features = max_features
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.random_state = random_state
    def __dealloc__(self):
        """Destructor. Frees the manually allocated C buffers.

        free(NULL) is a no-op, so this is safe even if init() was never
        called and the pointers are still NULL.
        """
        free(self.samples)
        free(self.features)
        free(self.constant_features)
        free(self.feature_values)
    def __getstate__(self):
        # Pickling support: the splitter carries only transient C buffers,
        # so no state is serialized.
        return {}
    def __setstate__(self, d):
        pass
    cdef int init(self,
                  object X,
                  const DOUBLE_t[:, ::1] y,
                  DOUBLE_t* sample_weight) except -1:
        """Initialize the splitter.

        Take in the input data X, the target Y, and optional sample weights.

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.

        Parameters
        ----------
        X : object
            This contains the inputs. Usually it is a 2d numpy array.
        y : ndarray, dtype=DOUBLE_t
            This is the vector of targets, or true labels, for the samples
        sample_weight : DOUBLE_t*
            The weights of the samples, where higher weighted samples are fit
            closer than lower weight samples. If not provided, all samples
            are assumed to have uniform weight.
        """
        # Derive the C-level RNG seed from the user-supplied random_state.
        self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
        cdef SIZE_t n_samples = X.shape[0]
        # Create a new array which will be used to store nonzero
        # samples from the feature of interest
        # (safe_realloc raises MemoryError on allocation failure, which
        # propagates via the `except -1` declaration).
        cdef SIZE_t* samples = safe_realloc(&self.samples, n_samples)
        cdef SIZE_t i, j
        cdef double weighted_n_samples = 0.0
        j = 0
        for i in range(n_samples):
            # Only work with positively weighted samples
            # NOTE(review): the test is `!= 0.0`, so strictly this keeps any
            # nonzero-weight sample, not only positive ones — confirm that
            # callers never pass negative weights.
            if sample_weight == NULL or sample_weight[i] != 0.0:
                samples[j] = i
                j += 1
                if sample_weight != NULL:
                    weighted_n_samples += sample_weight[i]
                else:
                    weighted_n_samples += 1.0
        # Number of samples is number of positively weighted samples
        self.n_samples = j
        self.weighted_n_samples = weighted_n_samples
        cdef SIZE_t n_features = X.shape[1]
        # features[] starts as the identity permutation [0, ..., n_features-1];
        # subclasses shuffle/partition it during node_split.
        cdef SIZE_t* features = safe_realloc(&self.features, n_features)
        for i in range(n_features):
            features[i] = i
        self.n_features = n_features
        # Scratch buffers: feature_values holds one column of X per candidate
        # feature; constant_features tracks features found constant so far.
        safe_realloc(&self.feature_values, n_samples)
        safe_realloc(&self.constant_features, n_features)
        self.y = y
        self.sample_weight = sample_weight
        return 0
    cdef int node_reset(self, SIZE_t start, SIZE_t end,
                        double* weighted_n_node_samples) nogil except -1:
        """Reset splitter on node samples[start:end].

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.

        Parameters
        ----------
        start : SIZE_t
            The index of the first sample to consider
        end : SIZE_t
            The index of the last sample to consider
        weighted_n_node_samples : ndarray, dtype=double pointer
            The total weight of those samples
        """
        self.start = start
        self.end = end
        # Re-initialize the criterion's statistics on this node's sample
        # range; it computes weighted_n_node_samples as a side effect.
        self.criterion.init(self.y,
                            self.sample_weight,
                            self.weighted_n_samples,
                            self.samples,
                            start,
                            end)
        # Report the node's total sample weight back to the caller via the
        # output pointer.
        weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples
        return 0
    cdef int node_split(self, double impurity, SplitRecord* split,
                        SIZE_t* n_constant_features) nogil except -1:
        """Find the best split on node samples[start:end].

        This is a placeholder method. The majority of computation will be done
        here.

        It should return -1 upon errors.
        """
        pass
    cdef void node_value(self, double* dest) nogil:
        """Copy the value of node samples[start:end] into dest."""
        # Delegated entirely to the criterion, which holds the node statistics.
        self.criterion.node_value(dest)
    cdef double node_impurity(self) nogil:
        """Return the impurity of the current node."""
        return self.criterion.node_impurity()