/*!
 * \brief Accumulate the gradient pairs of a set of rows into a histogram, for a
 *        bin-index matrix laid out densely (n_features entries per row).
 *
 * For each row id r taken from row_indices and each feature j in [0, n_features),
 * the two float values of gpair[r] are added to the two consecutive FPType slots
 * of hist selected by (bin index stored for (r, j) + gmat.index.Offset()[j]).
 *
 * \tparam FPType      type through which the storage of hist is read and written
 *                     (hist.data() is reinterpreted as FPType*).
 *                     NOTE(review): presumably must match the floating-point type
 *                     stored in GHistRow - confirm against its definition.
 * \tparam do_prefetch when true, issues PREFETCH_READ_T0 for the row located
 *                     Prefetch::kPrefetchOffset positions ahead in row_indices.
 *                     That look-ahead read of row_indices.begin is not
 *                     bounds-checked here, so the caller has to guarantee that
 *                     those positions are readable.
 * \tparam BinIdxType  element type used to read the bin-index storage of gmat.
 * \param gpair        gradient pairs, indexed by row id.
 * \param row_indices  the row ids to accumulate (begin pointer + Size()).
 * \param gmat         matrix providing the bin indices and per-feature offsets.
 * \param n_features   number of bin-index entries stored per row.
 * \param hist         histogram that receives the sums; updated in place.
 */
template<typename FPType, bool do_prefetch, typename BinIdxType>
void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
                          const RowSetCollection::Elem row_indices,
                          const GHistIndexMatrix& gmat,
                          const size_t n_features,
                          GHistRow hist) {
  // Every element of 'gpair' and 'hist' holds 2 FP values (gradient and hessian).
  // Both containers are addressed as single flat FP arrays, so each row index and
  // bin index is multiplied by this factor.
  constexpr uint32_t kValuesPerPair = 2;

  const size_t n_rows = row_indices.Size();
  const size_t* const row_ids = row_indices.begin;
  const float* const gpair_flat = reinterpret_cast<const float*>(gpair.data());
  // Pointer to the bin indices viewed as BinIdxType.
  // NOTE(review): presumably this is the index's underlying data_ptr_ - confirm.
  const BinIdxType* const bin_matrix = gmat.index.data<BinIdxType>();
  const uint32_t* const feature_offsets = gmat.index.Offset();
  FPType* const hist_flat = reinterpret_cast<FPType*>(hist.data());

  for (size_t r = 0; r < n_rows; ++r) {
    const size_t row = row_ids[r];

    if (do_prefetch) {
      // Touch the gradient pair and the bin indices of a row further down the
      // list so they are requested before the loop reaches that row.
      const size_t ahead_row = row_ids[r + Prefetch::kPrefetchOffset];
      const size_t ahead_begin = ahead_row * n_features;
      const size_t ahead_end = ahead_begin + n_features;
      PREFETCH_READ_T0(gpair_flat + kValuesPerPair * ahead_row);
      for (size_t pos = ahead_begin; pos < ahead_end;
           pos += Prefetch::GetPrefetchStep<BinIdxType>()) {
        PREFETCH_READ_T0(bin_matrix + pos);
      }
    }

    // Gradient/hessian of the current row and the start of its bin indices.
    const float* const row_gh = gpair_flat + kValuesPerPair * row;
    const BinIdxType* const row_bins = bin_matrix + row * n_features;
    for (size_t f = 0; f < n_features; ++f) {
      // The stored bin index plus the feature's offset gives the histogram bin;
      // the arithmetic is kept in 32 bits on purpose.
      const uint32_t global_bin = static_cast<uint32_t>(row_bins[f]) + feature_offsets[f];
      FPType* const slot = hist_flat + kValuesPerPair * global_bin;
      slot[0] += row_gh[0];
      slot[1] += row_gh[1];
    }
  }
}
函数模板参数分析
FPType//传入数据类型FPType:猜测是表示特征的类型,在spark的接口里传入的类型是double(从代码看,hist.data()被reinterpret_cast为FPType*,即FPType是访问直方图数据时使用的浮点类型)
//在文件夹Spark-ml-algo-lib中
void BuildHistDenseKernel<double, false, uint8_t>(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const size_t n_features,
GHistRow hist)
传入bool类型的数据
do_prefetch//判断是否提前预取(prefetch)数据
传入数据类型
BinIdxType//桶的下标数据类型
函数参数分析
(带const的参数都是在BuildHistDenseKernel中保持不变的参数)
const std::vector<GradientPair>& gpair//类型为GradientPair类型的向量容器
其中using xgboost::GradientPair = typedef detail::GradientPairInternal<float> //存储grad和hess
参考链接:RowSetCollection
const RowSetCollection::Elem row_indices
Elem:data structure to store an instance set, a subset of rows (instances) associated with a particular node in a decision tree
存储实例集合的数据结构,实例的子集与决策树的特定节点有关
参考链接:RowSetCollection
const GHistIndexMatrix& gmat
GHistIndexMatrix :brief preprocessed global index matrix, in CSR format
Transform floating values to integer index in histogram This is a global histogram index for CPU histogram. On GPU ellpack page is used.
预处理后的全局索引矩阵,以CSR(压缩稀疏行)格式存储;将浮点数值转化为直方图中的整型索引。这是用于CPU直方图的全局直方图索引;在GPU上则使用ellpack page。
const size_t n_features
size_t 无符号的整型
GHistRow hist
/*brief histogram of gradient statistics for a single node.
Consists of multiple GradStats, each entry showing total gradient statistics
for that particular bin
Uses global bin id so as to represent all features simultaneously
*/
单个节点的梯度统计信息直方图。由多个GradStats组成,每个条目表示对应桶的梯度统计总量。使用全局 bin ID,以便同时表示所有特征。
其中using xgboost::common::GHistRow = typedef Span<xgboost::GradientPairPrecise>
参考链接:span