Overview: DGL's graph-computation kernels have changed substantially across releases.
v0.3-v0.4 use minigun; in the v0.3 and v0.4 source trees the relevant code lives under dgl/src/kernel on the corresponding branches.
v0.5 rewrote the kernels and dropped minigun; see pull 1644. From that point on, DGL abstracts GNN computation into two interfaces, SpMM and SDDMM; the DGL v0.5 paper is also a good reference here.
Later, the AWS DGL team proposed FeatGraph, a new TVM-based kernel published at SC20; it has been partially merged into DGL v0.6 under dgl/featgraph/.
This post is a quick walkthrough of how the DGL kernels evolved.
P.S. The code after v0.5 feels much cleaner and easier to read (even I, who cannot write C++ at all, can roughly trace the control flow).
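To make the two abstractions concrete before diving in: in v0.5-style user API, an SpMM-like pattern is update_all (a message on every edge, reduced onto destination nodes), and an SDDMM-like pattern is apply_edges (one output per edge, no reduction). A minimal sketch using standard DGL built-ins; the toy graph and feature names are my own:
import torch
import dgl
import dgl.function as fn

g = dgl.graph(([0, 1, 2], [1, 2, 0]))   # toy 3-node cycle
g.ndata['h'] = torch.randn(3, 4)
g.edata['w'] = torch.randn(3, 1)

# SpMM-style: message on every edge, summed into the destination node
g.update_all(fn.u_mul_e('h', 'w', 'm'), fn.sum('m', 'h_new'))

# SDDMM-style: one output value per edge, no reduction
g.apply_edges(fn.u_dot_v('h', 'h', 'score'))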
v0.4.3post2
Using update_all in graph.py as the entry point:
# graph.py line 3233
with ir.prog() as prog:
    scheduler.schedule_update_all(graph=AdaptedDGLGraph(self),
                                  message_func=message_func,
                                  reduce_func=reduce_func,
                                  apply_func=apply_node_func)
    Runtime.run(prog)
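For reference, a user call that lands in this scheduling path could look like the following (a sketch using v0.4-era built-ins; the toy graph and feature names are made up):
import torch
import dgl
import dgl.function as fn

g = dgl.DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1, 2], [1, 2, 0])
g.ndata['h'] = torch.randn(3, 4)

# built-in message + built-in reduce: eligible for the kernel fusion below
g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h_sum'))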
Stepping into scheduler.schedule_update_all():
# scheduler.py line 237
reduced_feat = _gen_send_reduce(src_node_frame=graph.srcframe,
                                dst_node_frame=graph.dstframe,
                                edge_frame=graph.edgeframe,
                                message_func=message_func,
                                reduce_func=reduce_func,
                                var_send_edges=var_eid,
                                var_reduce_nodes=var_recv_nodes,
                                uv_getter=uv_getter,
                                adj_creator=adj_creator,
                                out_map_creator=out_map_creator,
                                canonical_etype=graph.canonical_etype)
Stepping into _gen_send_reduce(): it checks whether both message_func and reduce_func are built-ins. If so, it tries to fuse the message and reduce kernels, which cuts memory usage and speeds up the computation; otherwise all messages may need to be materialized first (this is exactly the memory that fusion saves) and reduced afterwards. Here we follow the path where both are built-ins:
# scheduler.py line 904
# 3. First try fused message and reduce function
if mfunc_is_list and rfunc_is_list:
    # builtin message + builtin reducer
    spmv.gen_v2v_spmv_schedule(graph=adj,
                               mfunc=mfunc,
                               rfunc=rfunc,
                               src_frame=var_src_nf,
                               dst_frame=var_dst_nf,
                               edge_frame=var_ef,
                               out=var_out,
                               out_size=len(reduce_nodes),
                               edge_map=edge_map,
                               out_map=out_map)
    return var_out
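To see what the fused branch buys, compare a fusable built-in pair with a Python UDF that forces message materialization (continuing the toy graph from the sketch above; the UDF signature is the standard DGL one):
# fused: both functions are built-ins, so no |E| x d message tensor is created
g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h_sum'))

# not fused: a Python UDF makes DGL materialize all messages first
def my_message(edges):
    return {'m': edges.src['h'] * 2}

g.update_all(my_message, fn.sum('m', 'h_sum'))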
Stepping into spmv.gen_v2v_spmv_schedule():
# spmv.py line 55
ftdst = mfn._invoke(graph, src_frame, dst_frame, edge_frame, out_size,
                    src_map, dst_map, edge_map, out_map,
                    reducer=rfn.name)
This calls message_function._invoke; following it further:
# python/dgl/function/message.py line 59
return ir.BINARY_REDUCE(reducer, self.binary_op, graph, self.lhs,
                        self.rhs, lhs_data, rhs_data, out_size,
                        lhs_map, rhs_map, out_map)
Inside BINARY_REDUCE:
# executor.py line 1091
def BINARY_REDUCE(reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data,
                  out_size, lhs_map, rhs_map, out_map, ret=None):
    reg = IR_REGISTRY[OpCode.BINARY_REDUCE]
    ret = var.new(reg['ret_type']) if ret is None else ret
    get_current_prog().issue(reg['executor_cls'](
        reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, out_size,
        lhs_map, rhs_map, out_map, ret))
    return ret
Execution is then carried out by BinaryReduceExecutor:
# executor.py line 1062
def run(self):
    lhs_data = self.lhs_data.data
    rhs_data = self.rhs_data.data
    ctx = utils.to_dgl_context(F.context(lhs_data))
    graph = self.graph.data(ctx)
    lhs_map = self.lhs_map.data(ctx) if self.lhs_map.data else None
    rhs_map = self.rhs_map.data(ctx) if self.rhs_map.data else None
    out_map = self.out_map.data(ctx) if self.out_map.data else None
    if not isinstance(lhs_map, tuple):
        lhs_map = (lhs_map, lhs_map)
    if not isinstance(rhs_map, tuple):
        rhs_map = (rhs_map, rhs_map)
    if not isinstance(out_map, tuple):
        out_map = (out_map, out_map)
    self.ret.data = F.binary_reduce(
        self.reducer, self.binary_op, graph, self.lhs, self.rhs,
        lhs_data, rhs_data, self.out_size, lhs_map, rhs_map, out_map)
Here F is the backend (backend.py), which dispatches the rest of the execution to whichever framework is in use (PyTorch, TensorFlow, MXNet). The binary_reduce function in backend.py itself is empty:
# backend.py line 1185
# Note: These operators are supposed to be implemented using DGL-provided
# kernels (see kernel.py), and plug into tensor framework using custom op
# extensions.
def binary_reduce(reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data,
                  out_size, lhs_map, rhs_map, out_map):
    pass  # implemented per framework; see each backend's tensor.py
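For the PyTorch backend, the actual implementation wraps the kernel call in a custom autograd function; its shape is roughly the following (a simplified sketch from memory, not the verbatim source; the backward pass and the exact broadcast-shape inference are omitted):
import torch as th
from dgl import kernel as K  # the kernel.py wrapper shown below

class BinaryReduce(th.autograd.Function):
    @staticmethod
    def forward(ctx, reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data,
                out_size, lhs_map, rhs_map, out_map):
        # the real code infers the broadcast feature shape; assume lhs's here
        out_data = lhs_data.new_empty((out_size,) + lhs_data.shape[1:])
        K.binary_op_reduce(reducer, binary_op, graph, lhs, rhs,
                           lhs_data, rhs_data, out_data,
                           lhs_map[0], rhs_map[0], out_map[0])
        # save whatever backward needs, then return the filled buffer
        ctx.save_for_backward(lhs_data, rhs_data, out_data)
        return out_data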
As the comment says, the implementation lives in kernel.py; the call lands in binary_op_reduce():
# kernel.py line 29
def binary_op_reduce(reducer, op, G, A_target, B_target, A, B, out,
                     A_rows=None, B_rows=None, out_rows=None):
    ...
    _CAPI_DGLKernelBinaryOpReduce(
        reducer, op, G,
        int(A_target), int(B_target),
        A, B, out,
        A_rows, B_rows, out_rows)
From here we cross into the C++ code. _CAPI_DGLKernelBinaryOpReduce is registered at:
// binary_reduce.cc line 357
DGL_REGISTER_GLOBAL("kernel._CAPI_DGLKernelBinaryOpReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    std::string op = args[1];
    int lhs = args[3];
    int rhs = args[4];
    NDArray lhs_data = args[5];
    NDArray rhs_data = args[6];
    NDArray out_data = args[7];
    NDArray lhs_mapping = args[8];
    NDArray rhs_mapping = args[9];
    NDArray out_mapping = args[10];
    auto f = [&reducer, &op, &lhs, &rhs, &lhs_data, &rhs_data, &out_data,
              &lhs_mapping, &rhs_mapping,
              &out_mapping](const CSRWrapper& wrapper) {
      BinaryOpReduce(reducer, op, wrapper, static_cast<binary_op::Target>(lhs),
                     static_cast<binary_op::Target>(rhs), lhs_data, rhs_data,
                     out_data, lhs_mapping, rhs_mapping, out_mapping);
    };
    csrwrapper_switch(args[2], f);
  });
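(On the Python side, these registered globals become callable because kernel.py initializes the FFI namespace at import time; paraphrasing from memory the bottom of python/dgl/kernel.py:)
from ._ffi.function import _init_api
_init_api("dgl.kernel")  # exposes kernel._CAPI_* globals as Python functions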
BinaryOpReduce is at line 295:
// binary_reduce.cc line 295
void BinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_data, NDArray rhs_data,
    NDArray out_data,
    NDArray lhs_mapping, NDArray rhs_mapping,
    NDArray out_mapping) {
  ...
  DGL_XPU_SWITCH(ctx.device_type, BinaryReduceImpl,
                 reducer, op, graph,
                 lhs, rhs,
                 lhs_data, rhs_data, out_data,
                 lhs_mapping, rhs_mapping, out_mapping);
}
The rest of the work happens in binary_reduce_impl.cc and binary_reduce_impl.h (the GPU counterparts use the .cu and .cuh suffixes), and binary_reduce_impl.h pulls in #include <minigun/minigun.h>.
The dispatch chain here is a bit winding; roughly, DGL_XPU_SWITCH is defined in common.h as follows:
// common.h line 27
#define DGL_XPU_SWITCH(val, Method, ...)              \
  if (val == kDLCPU) {                                \
    Method<kDLCPU>(__VA_ARGS__);                      \
  } else if (val == kDLGPU) {                         \
    Method<kDLGPU>(__VA_ARGS__);                      \
  } else {                                            \
    LOG(FATAL) << "Unsupported device type: " << val; \
  }
Based on the incoming ctx.device_type, it selects CPU or GPU execution; what actually runs is the template function passed in as Method, i.e. BinaryReduceImpl, which is defined in kernel/binary_reduce_impl.h. This is where minigun enters the picture:
// binary_reduce_impl.h line 65
template <int XPU>
void BinaryReduceImpl(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping) {
  using runtime::NDArray;
  using minigun::Csr;
  // device
#ifdef __CUDACC__
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
  const int64_t x_len = utils::ComputeXLength(out_data);
  // advance config
  minigun::advance::RuntimeConfig rtcfg;
  rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
  if (reducer == binary_op::kReduceMean) {
    // TODO(minjie): divide
    LOG(FATAL) << "reduce mean is not supported.";
  }
  const DLDataType& dtype = out_data->dtype;
  const auto bits = graph.NumBits();
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
        auto gdata = AllocGData<XPU, Idx, DType, Reducer>(op,
            rtcfg.ctx, x_len, lhs_mapping, rhs_mapping,
            lhs_data, rhs_data, out_mapping, out_data);
        OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
          CallBinaryReduce<XPU, Idx, DType, LeftTarget,
                           RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
        });
      });
    });
  });
}
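The data-dimension launch configuration above is easy to check by hand (a plain-Python rendering of the same arithmetic; x_len = 100 is just an example value):
def find_num_threads(dim, max_nthrs=64):
    # largest power of two <= dim, capped at max_nthrs (mirrors utils::FindNumThreads)
    ret = max_nthrs
    while ret > dim:
        ret >>= 1
    return ret

x_len = 100                                          # example feature length
nt = find_num_threads(x_len)                         # -> 64
# each thread covers two elements (the hard-coded ILP trick in the comment above)
data_num_blocks = (x_len + nt * 2 - 1) // (nt * 2)   # -> 1
print(nt, data_num_blocks)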
When running on GPU, this code also fills in the block and thread counts in minigun's RuntimeConfig (defined in minigun's advance.h). It ends by calling CallBinaryReduce(); the CPU version lives in kernel/cpu/binary_reduce_impl.h:
// kernel/cpu/binary_reduce_impl.h line 143
// Template implementation of BinaryReduce operator.
template <int XPU, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg,
                      const CSRWrapper& graph,
                      GData<Idx, DType>* gdata) {
  typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
                             RightSelector, BinaryOp, Reducer>
          Functors;
  typedef cpu::BinaryReduce<Idx, DType, Functors> UDF;
  ...
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, GData<Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
So the final call is minigun::advance::Advance(), in advance.h. Note also the UDF this code builds (the BinaryReduce struct in the same directory, at line 24): it matters a lot, because it is passed all the way down into minigun, which invokes it to do the actual computation. (The UDF sources live under dgl/src/kernel/cpu or cuda, while minigun comes in as a third-party library, which makes jumping back and forth a bit awkward.)
// minigun/minigun/advance.h line 60
/*!
 * \brief Advance kernel.
 *
 * \tparam XPU The computing device type (DLDeviceType)
 * \tparam Idx The type of the index (usually int32_t or int64_t)
 * \tparam Config The static configuration of advance kernel.
 * \tparam GData The user-defined graph data.
 * \tparam Functor The user-defined functions.
 * \tparam Alloc The external allocator type.
 * \param config Runtime configuration of this advance kernel.
 * \param csr The graph csr structure.
 * \param gdata The pointer to the user-defined graph data structure.
 *              This pointer must be a host pointer and it will be
 *              dereferenced and its content will be copied to the
 *              device for execution.
 * \param alloc The external memory allocator.
 */
template <int XPU,
          typename Idx,
          typename DType,
          typename Config,
          typename GData,
          typename Functor,
          typename Alloc = DefaultAllocator<XPU> >
void Advance(const RuntimeConfig& config,
             const SpMat<Idx>& spmat,
             GData* gdata,
             Alloc* alloc = DefaultAllocator<XPU>::Get()) {
  DispatchXPU<XPU, Idx, DType, Config, GData, Functor, Alloc>::Advance(
      config, spmat, gdata, alloc);
}
Likewise, depending on the device type, this lands in CPUAdvanceAll in minigun/cpu/advance_all.h or CudaAdvanceAll in minigun/cuda/advance_all.cuh, each of which then picks a parallelization strategy:
Note: the code in https://github.com/dglai/minigun does not quite match what you get from git clone --recurse-submodules -b 0.4.x https://github.com/dmlc/dgl.git.
The code in the standalone minigun repository:
// standalone minigun repo: cpu/advance_all.h line 102
void CPUAdvanceAll(
    const SpMat<Idx>& spmat,
    GData* gdata,
    Alloc* alloc) {
  switch (Config::kParallel) {
    case kSrc:
      if (spmat.out_csr != nullptr)
        CPUAdvanceAllNodeParallel<Idx, DType, Config, GData, Functor, Alloc>
          (*spmat.out_csr, gdata);
      else
        LOG(FATAL) << "out_csr need to be created in source parallel mode.";
      break;
    case kEdge:
      if (spmat.coo != nullptr)
        CPUAdvanceAllEdgeParallel<Idx, DType, Config, GData, Functor, Alloc>
          (*spmat.coo, gdata);
      else if (spmat.out_csr != nullptr)
        CPUAdvanceAllEdgeParallelCSR<Idx, DType, Config, GData, Functor, Alloc>
          (*spmat.out_csr, gdata, true);
      else if (spmat.in_csr != nullptr)
        CPUAdvanceAllEdgeParallelCSR<Idx, DType, Config, GData, Functor, Alloc>
          (*spmat.in_csr, gdata, false);
      else
        LOG(FATAL) << "At least one sparse format should be created.";
      break;
    case kDst:
      if (spmat.in_csr != nullptr)
        CPUAdvanceAllNodeParallel<Idx, DType, Config, GData, Functor, Alloc>
          (*spmat.in_csr, gdata);
      else
        LOG(FATAL) << "in_csr need to be created in destination parallel mode.";
      break;
  }
}
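The difference between the strategies is what the parallel loop ranges over. A serial numpy sketch of both traversals (toy graph, copy-src + sum as the edge function; everything here is illustrative):
import numpy as np

# toy graph 0->1, 0->2, 1->2 stored as an *in*-edge CSR:
# row = destination, columns = sources
indptr  = np.array([0, 0, 1, 3])     # dst 0: no in-edges; dst 1: one; dst 2: two
indices = np.array([0, 0, 1])        # source of each in-edge
h       = np.array([1.0, 2.0, 3.0])  # node features
out     = np.zeros(3)

# node-parallel (kDst): the OpenMP loop runs over destination rows;
# each iteration owns its output slot, so no atomics are needed
for dst in range(len(indptr) - 1):
    acc = 0.0
    for e in range(indptr[dst], indptr[dst + 1]):
        acc += h[indices[e]]         # ApplyEdgeReduce: copy-src, then sum
    out[dst] = acc

# edge-parallel (kEdge): the loop runs over edges of the COO form;
# concurrent writes to the same destination would need atomic updates
dst_of = np.repeat(np.arange(3), np.diff(indptr))
out2 = np.zeros(3)
for e in range(len(indices)):
    out2[dst_of[e]] += h[indices[e]]  # ApplyEdge; '+=' must be atomic in parallel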
The DGL 0.4.x source obtained through git has no cpu/advance_all.h header (it does have cuda/advance_all.cuh); CPU_Advance is defined directly in cpu/advance.h. (My guess is that this part was due for a rewrite in v0.5 anyway, and since few people run on CPU, the CPU-side optimizations were never added to v0.4.) If you want to trace the whole flow end to end, the git-archived code is the one to follow; otherwise some pieces will not line up. For instance, Config::kParallel from the snippet above does not correspond to the Config::kAdvanceAll used here:
// third-party/minigun/minigun/cpu/advance.h line 66
template <typename Idx,
          typename Config,
          typename GData,
          typename Functor,
          typename Alloc>
void CPUAdvance(Csr<Idx> csr,
                GData* gdata,
                IntArray1D<Idx> input_frontier,
                IntArray1D<Idx> output_frontier,
                IntArray1D<Idx> lcl_row_offsets,
                Alloc* alloc) {
  Idx N = Config::kAdvanceAll ? csr.row_offsets.length - 1 : input_frontier.length;
#pragma omp parallel for
  for (Idx vid = 0; vid < N; ++vid) {
    Idx src = vid;
    if (!Config::kAdvanceAll) {
      src = input_frontier.data[vid];
    }
    const Idx row_start = csr.row_offsets.data[src];
    const Idx row_end = csr.row_offsets.data[src + 1];
    for (Idx eid = row_start; eid < row_end; ++eid) {
      const Idx dst = csr.column_indices.data[eid];
      if (Functor::CondEdge(src, dst, eid, gdata)) {
        Functor::ApplyEdge(src, dst, eid, gdata);
        if (Config::kMode != kV2N && Config::kMode != kE2N) {
          Idx out_idx;
          if (Config::kAdvanceAll) {
            out_idx = eid;
          } else {
            out_idx = eid - row_start + lcl_row_offsets.data[vid];
          }
          if (Config::kMode == kV2V || Config::kMode == kE2V) {
            output_frontier.data[out_idx] = dst;
          } else {
            output_frontier.data[out_idx] = eid;
          }
        }
      } else {
        if (Config::kMode != kV2N && Config::kMode != kE2N) {
          Idx out_idx;
          if (Config::kAdvanceAll) {
            out_idx = eid;
          } else {
            out_idx = eid - row_start + lcl_row_offsets.data[vid];
          }
          output_frontier.data[out_idx] = MG_INVALID;
        }
      }
    }
  }
}
The GPU side is analogous, but note that binary_reduce_sum.cu has a branch that computes directly with cuSPARSE:
// binary_reduce_sum.cu line 228
template <>
void CallBinaryReduce<kDLGPU, int32_t, float, SelectSrc, SelectNone,
                      BinaryUseLhs<float>, ReduceSum<kDLGPU, float>>(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<int32_t, float>* gdata) {
  if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
    cuda::FallbackCallBinaryReduce<float>(rtcfg, graph, gdata);
  } else {
    // cusparse use rev csr for csrmm
    auto csr = graph.GetInCSRMatrix();
    cuda::CusparseCsrmm2(rtcfg, csr, gdata->lhs_data, gdata->out_data,
                         gdata->x_length);
  }
}
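What the cuSPARSE branch computes is just a sparse-dense matmul: for the copy-src + sum special case, reducing messages into destinations equals multiplying the feature matrix by the in-edge adjacency (the "rev csr" in the comment). A scipy sketch of the equivalence, with toy data:
import numpy as np
import scipy.sparse as sp

src = np.array([0, 0, 1]); dst = np.array([1, 2, 2])   # edges 0->1, 0->2, 1->2
X = np.random.randn(3, 4).astype(np.float32)           # source-node features

# in-edge CSR: A[d, s] = 1 iff edge s->d exists
A = sp.csr_matrix((np.ones(3, np.float32), (dst, src)), shape=(3, 3))

# copy_src + sum == one csrmm: out[d] = sum of X[s] over in-neighbors s
out = A @ X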
Back to CPUAdvanceAll: there are two main parallelization strategies, CPUAdvanceAllNodeParallel and CPUAdvanceAllEdgeParallel. CPUAdvanceAllNodeParallel looks like this:
// minigun/cpu/advance_all.h line 62
void CPUAdvanceAllNodeParallel(
    const Csr<Idx>& csr,
    GData *gdata) {
  const Idx N = csr.row_offsets.length - 1;
  const Idx feat_size = Functor::GetFeatSize(gdata);
  DType *outbuf = Functor::GetOutBuf(gdata);
#pragma omp parallel for
  for (Idx vid = 0; vid < N; ++vid) {
    const Idx start = csr.row_offsets.data[vid];
    const Idx end = csr.row_offsets.data[vid + 1];
    if (start < end) {
      for (Idx feat_idx = 0; feat_idx < feat_size; ++feat_idx) {
        DType val = static_cast<DType>(0);
        const Idx outoff = Functor::GetOutOffset(vid, gdata) * feat_size + feat_idx;
        if (outbuf != nullptr)
          val = outbuf[outoff];
        for (Idx eid = start; eid < end; ++eid) {
          Idx src, dst;
          if (Config::kParallel == kDst) {
            src = csr.column_indices.data[eid];
            dst = vid;
          } else {  // kSrc
            dst = csr.column_indices.data[eid];
            src = vid;
          }
          Functor::ApplyEdgeReduce(src, dst, eid, feat_idx, &val, gdata);
        }
        if (outbuf != nullptr)
          outbuf[outoff] = val;
      }
    }
  }
}
The parallelism is hand-written with OpenMP (the for loop runs over nodes), and each edge ends up in a call to Functor::ApplyEdgeReduce. I could not actually find ApplyEdgeReduce anywhere in the DGL source, though.
CPUAdvanceAllEdgeParallel looks like this:
// minigun/cpu/advance_all.h line 44
void CPUAdvanceAllEdgeParallel(
    const Coo<Idx>& coo,
    GData *gdata) {
  const Idx E = coo.column.length;
#pragma omp parallel for
  for (Idx eid = 0; eid < E; ++eid) {
    const Idx src = coo.row.data[eid];
    const Idx dst = coo.column.data[eid];
    Functor::ApplyEdge(src, dst, eid, gdata);
  }
}
Again OpenMP hand-written parallelism (this time the for loop runs over edges), calling Functor::ApplyEdge(). This Functor is the one mentioned earlier, passed all the way down from CallBinaryReduce; ApplyEdge actually lives in /src/kernel/cpu/binary_reduce_impl.h:
// /src/kernel/cpu/binary_reduce_impl.h line 22
// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    for (int64_t tx = 0; tx < D; ++tx) {
      DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      Functors::Write(outoff + tx, out);
    }
  }
};
At this point we are effectively back in /src/kernel/. Note that along the feature dimension D, the computation proceeds one scalar at a time, in the final for loop.
------------------------------------------------------------
Now let's look more closely at how the computation actually runs on CPU and on GPU, using the git-archived DGL source as the reference.
On CPU:
// third-party/minigun/cpu/advance.h
template <typename Idx,
          typename Config,
          typename GData,
          typename Functor,
          typename Alloc>
void CPUAdvance(Csr<Idx> csr,
                GData* gdata,
                IntArray1D<Idx> input_frontier,
                IntArray1D<Idx> output_frontier,
                IntArray1D<Idx> lcl_row_offsets,
                Alloc* alloc) {
  Idx N = Config::kAdvanceAll ? csr.row_offsets.length - 1 : input_frontier.length;
  // OpenMP parallelism: note that the for loop is parallelized over nodes
#pragma omp parallel for
  for (Idx vid = 0; vid < N; ++vid) {
    Idx src = vid;
    if (!Config::kAdvanceAll) {
      src = input_frontier.data[vid];
    }
    // offsets of all edges of the node being processed
    const Idx row_start = csr.row_offsets.data[src];
    const Idx row_end = csr.row_offsets.data[src + 1];
    for (Idx eid = row_start; eid < row_end; ++eid) {
      const Idx dst = csr.column_indices.data[eid];
      // CondEdge() seems to always return true?
      if (Functor::CondEdge(src, dst, eid, gdata)) {
        Functor::ApplyEdge(src, dst, eid, gdata);
        // kMode appears to default to kV2N
        if (Config::kMode != kV2N && Config::kMode != kE2N) {
          Idx out_idx;
          if (Config::kAdvanceAll) {
            out_idx = eid;
          } else {
            out_idx = eid - row_start + lcl_row_offsets.data[vid];
          }
          if (Config::kMode == kV2V || Config::kMode == kE2V) {
            output_frontier.data[out_idx] = dst;
          } else {
            output_frontier.data[out_idx] = eid;
          }
        }
      } else {
        if (Config::kMode != kV2N && Config::kMode != kE2N) {
          Idx out_idx;
          if (Config::kAdvanceAll) {
            out_idx = eid;
          } else {
            out_idx = eid - row_start + lcl_row_offsets.data[vid];
          }
          output_frontier.data[out_idx] = MG_INVALID;
        }
      }
    }
  }
}
Source of CondEdge() and ApplyEdge():
// src/kernel/cpu/binary_reduce_impl.h line 22
// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;   // feature dimension
    const int64_t len = gdata->data_len; // data size? gdata is defined in binary_reduce_impl_decl.h
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    // lhs_data and rhs_data are the inputs; out_data is the output
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    // the feature is computed one scalar at a time
    for (int64_t tx = 0; tx < D; ++tx) {
      // Functors::Op ends up in the Call() methods of the functors in functor.h;
      // the concrete op is selected via OP_TARGET_SWITCH in
      // src/kernel/binary_reduce_impl.h line 106, and the primitive ops
      // themselves are defined in binary_reduce_common.h
      DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      // write back the result
      Functors::Write(outoff + tx, out);
    }
  }
};
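The data_len field is what lets the same loop serve non-elementwise ops such as dot: each of the D output scalars comes from a length-len slice of the operands. In Python terms (toy numbers; dot chosen as the example op):
import numpy as np

D, ln = 4, 3                     # x_length and data_len
lhs = np.random.randn(D * ln)    # one operand's flattened feature
rhs = np.random.randn(D * ln)

# Functors::Op(lhsoff + tx*len, rhsoff + tx*len, len) for a 'dot' op:
out = np.array([lhs[tx*ln:(tx+1)*ln] @ rhs[tx*ln:(tx+1)*ln] for tx in range(D)])
# for elementwise ops (add, mul, ...) data_len is 1 and each slice is a scalar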
On GPU:
// third_party/minigun/cuda/advance.cuh line 21
template <typename Idx,
          typename Config,
          typename GData,
          typename Functor,
          typename Alloc>
struct DispatchXPU<kDLGPU, Idx, Config, GData, Functor, Alloc> {
  static void Advance(
      const RuntimeConfig& rtcfg,
      const Csr<Idx>& csr,
      GData* gdata,
      IntArray1D<Idx> input_frontier,
      IntArray1D<Idx>* output_frontier,
      Alloc* alloc) {
    // Call advance
    if (Config::kAdvanceAll) {
      AdvanceAlg algo = FindAdvanceAllAlgo<Idx, Config>(rtcfg, csr);
      CudaAdvanceAll<Idx, Config, GData, Functor, Alloc>(
          algo, rtcfg, csr, gdata, output_frontier, alloc);
    } else {
#if ENABLE_PARTIAL_FRONTIER
      AdvanceAlg algo = FindAdvanceAlgo<Idx, Config>(rtcfg, csr,
          input_frontier);
      CudaAdvanceExecutor<Idx, Config, GData, Functor, Alloc> exec(
          algo, rtcfg, csr, gdata, input_frontier, output_frontier, alloc);
      exec.Run();
#else
      LOG(FATAL) << "Partial frontier is not supported for CUDA 10.0";
#endif
    }
  }
};
// advance_all.cuh line 93
template <typename Idx,
          typename Config,
          typename GData,
          typename Functor,
          typename Alloc>
void CudaAdvanceAll(
    AdvanceAlg algo,
    const RuntimeConfig& rtcfg,
    const Csr<Idx>& csr,
    GData* gdata,
    IntArray1D<Idx>* output_frontier,
    Alloc* alloc) {
  Idx out_len = csr.column_indices.length;
  if (output_frontier) {
    if (output_frontier->data == nullptr) {
      // Allocate output frointer buffer, the length is equal to the number
      // of edges.
      output_frontier->length = out_len;
      output_frontier->data = alloc->template AllocateData<Idx>(
          output_frontier->length * sizeof(Idx));
    } else {
      CHECK_GE(output_frontier->length, out_len)
        << "Require output frontier of length " << out_len
        << " but only got a buffer of length " << output_frontier->length;
    }
  }
  IntArray1D<Idx> outbuf = (output_frontier)? *output_frontier : IntArray1D<Idx>();
  switch (algo) {
    case kGunrockLBOut :
      CudaAdvanceAllGunrockLBOut<Idx, Config, GData, Functor, Alloc>(
          rtcfg, csr, gdata, outbuf, alloc);
      break;
    default:
      LOG(FATAL) << "Algorithm " << algo << " is not supported.";
  }
}
// advance_all.cuh line 68
#define MAX_NTHREADS 1024
#define PER_THREAD_WORKLOAD 1
#define MAX_NBLOCKS 65535
template <typename Idx,
          typename Config,
          typename GData,
          typename Functor,
          typename Alloc>
void CudaAdvanceAllGunrockLBOut(
    const RuntimeConfig& rtcfg,
    const Csr<Idx>& csr,
    GData* gdata,
    IntArray1D<Idx> output_frontier,
    Alloc* alloc) {
  CHECK_GT(rtcfg.data_num_blocks, 0);
  CHECK_GT(rtcfg.data_num_threads, 0);
  const Idx M = csr.column_indices.length;  // number of edges in the graph
  // data_num_threads is derived from the feature size and never exceeds the
  // feature dimension (64 -> 32 -> 16 -> ...);
  // MAX_NTHREADS is the per-block thread limit, 1024
  const int ty = MAX_NTHREADS / rtcfg.data_num_threads;
  const int ny = ty * PER_THREAD_WORKLOAD;
  const int by = std::min((M + ny - 1) / ny, static_cast<Idx>(MAX_NBLOCKS));
  const dim3 nblks(rtcfg.data_num_blocks, by);
  const dim3 nthrs(rtcfg.data_num_threads, ty);
  //LOG(INFO) << "Blocks: (" << nblks.x << "," << nblks.y << ") Threads: ("
  //<< nthrs.x << "," << nthrs.y << ")";
  CudaAdvanceAllGunrockLBOutKernel<Idx, Config, GData, Functor>
    <<<nblks, nthrs, 0, rtcfg.stream>>>(csr, *gdata, output_frontier);
}
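Plugging in example numbers makes the 2D grid concrete (plain-Python arithmetic mirroring the code above; M and x_len are invented values). The kernel launched with this grid follows below:
MAX_NTHREADS, PER_THREAD_WORKLOAD, MAX_NBLOCKS = 1024, 1, 65535

x_len, M = 100, 10000            # feature length, number of edges
nt = 64                          # FindNumThreads(x_len, 64), see earlier
data_num_blocks = (x_len + nt * 2 - 1) // (nt * 2)   # 1 block along features

ty = MAX_NTHREADS // nt          # 16 threads along the edge axis
ny = ty * PER_THREAD_WORKLOAD    # 16 edges per block row
by = min((M + ny - 1) // ny, MAX_NBLOCKS)            # 625 blocks of edges
print((data_num_blocks, by), (nt, ty))  # grid (1, 625), block (64, 16)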
// advance_all.cuh line 34
template <typename Idx,
          typename Config,
          typename GData,
          typename Functor>
__global__ void CudaAdvanceAllGunrockLBOutKernel(
    Csr<Idx> csr,
    GData gdata,
    IntArray1D<Idx> output_frontier) {
  Idx ty = blockIdx.y * blockDim.y + threadIdx.y;
  Idx stride_y = blockDim.y * gridDim.y;
  Idx eid = ty;
  while (eid < csr.column_indices.length) {
    // TODO(minjie): this is pretty inefficient; binary search is needed only
    // when the thread is processing the neighbor list of a new node.
    Idx src = BinarySearchSrc(csr.row_offsets, eid);
    Idx dst = _ldg(csr.column_indices.data + eid);
    if (Functor::CondEdge(src, dst, eid, &gdata)) {
      Functor::ApplyEdge(src, dst, eid, &gdata);
      // Add dst/eid to output frontier
      if (Config::kMode == kV2V || Config::kMode == kE2V) {
        output_frontier.data[eid] = dst;
      } else if (Config::kMode == kV2E || Config::kMode == kE2E) {
        output_frontier.data[eid] = eid;
      }
    } else {
      if (Config::kMode != kV2N && Config::kMode != kE2N) {
        // Add invalid to output frontier
        output_frontier.data[eid] = MG_INVALID;
      }
    }
    eid += stride_y;
  }
}
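BinarySearchSrc recovers the source node of an edge id from the CSR row offsets; it is essentially a searchsorted, as in this numpy sketch (toy offsets):
import numpy as np

row_offsets = np.array([0, 2, 3, 3, 6])   # 4 nodes, 6 edges; node 2 has none

def binary_search_src(row_offsets, eid):
    # src of edge eid is the row whose [start, end) range contains eid
    return int(np.searchsorted(row_offsets, eid, side='right')) - 1

print([binary_search_src(row_offsets, e) for e in range(6)])
# -> [0, 0, 1, 3, 3, 3]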
If the reducer is sum, the minigun path can be skipped entirely, i.e. the logic in src/kernel/cuda/binary_reduce_impl.cuh is not used; instead the specialized CallBinaryReduce in src/kernel/cuda/binary_reduce_sum.cu computes directly with cuSPARSE:
// src/kernel/cuda/binary_reduce_sum.cu line 229
template <>
void CallBinaryReduce<kDLGPU, int32_t, float, SelectSrc, SelectNone,
                      BinaryUseLhs<float>, ReduceSum<kDLGPU, float>>(
    const RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<int32_t, float>* gdata) {
  // if any id mapping is present we still fall back to the minigun path,
  // since the plain cuSPARSE csrmm below cannot handle remapped node/edge ids
  if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
    cuda::FallbackCallBinaryReduce<float>(rtcfg, graph, gdata);
  } else {
    // cusparse use rev csr for csrmm
    auto csr = graph.GetInCSRMatrix();
    cuda::CusparseCsrmm2(rtcfg, csr, gdata->lhs_data, gdata->out_data,
                         gdata->x_length);
  }
}
The GPU parallel configuration also shows up here; let's look at it in detail:
// advance.h line 21
struct RuntimeConfig {
  // device context
  DLContext ctx;
  // the advance algorithm to use
  AdvanceAlg alg = kAuto;
  // number of thread blocks to process data dimension
  int data_num_blocks = 0;
  // number of threads per block to process data dimension
  int data_num_threads = 0;
#ifdef __CUDACC__
  cudaStream_t stream{nullptr};
#endif  // __CUDACC__
};
Where the RuntimeConfig is originally filled in:
// /src/kernel/binary_reduce_impl.h line 187
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
So the launch configuration is derived from the feature dimension x_len. utils::FindNumThreads is defined as:
// utils.cc line 17
int FindNumThreads(int dim, int max_nthrs) {
  int ret = max_nthrs;
  while (ret > dim) {
    ret = ret >> 1;
  }
  return ret;
}
With max_nthrs = 64 passed in, this finds the largest power of two that does not exceed the feature dimension, capped at 64 (e.g., dim = 100 gives 64, dim = 20 gives 16).
TODO: I still need to brush up on the CUDA programming here.
TODO: take a look at apply_edges, which produces a V2E advance.
Reference: https://github.com/dmlc/dgl/pull/596
TODO: look at v0.6 and the FeatGraph version.