导读: DGL kernel中针对Graph的计算几个版本有了不小的变动。
v0.3-0.4使用的是minigun, v0.3和v0.4源码中主要相关部分则是在对应分支dgl/src/kernel目录下。
v0.5中对kernel代码进行了重写,并不再继续使用minigun,可以参见pull 1644。此时DGL将GNN主要抽象为了SPMM和SDDMM两个接口,这里也可以参考DGL v0.5的论文。
后来AWS DGL团队又提出了一个新的基于tvm的kernel FeatGraph并发表在了SC20上,在DGL v0.6中已经部分合入了,在目录dgl/featgraph/。
本文想简单梳理一下DGL各kernel的改动。
ps: v0.5版本后的代码逻辑感觉清晰了很多,更易读一些(完全不会C++的我都能把逻辑流程大概走通)。
v0.5
DGL0.5合并了Homogeneous graph和heterogeneous graph.而且抽象出了spmm和sddmm两个主要接口。python部分代码流程参考之前写的:
文档:DGL0.5中的g-SpMM和g-SDDMM.md
链接:http://note.youdao.com/noteshare?id=91caad64477fbed7263440b994ed18f3&sub=D0139448E815495D84C9EA8E21671ADD
跟到定义sparse._CAPI_DGLKernelSpMM
的地方:
// dgl/src/array/kernel.cc line 154
DGL_REGISTER_GLOBAL("sparse._CAPI_DGLKernelSpMM")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
...
SpMM(op, reduce_op, graph.sptr(), U, E, V, {ArgU, ArgE});
});
SpMM()
的定义:
// dgl/src/array/kernel.cc line 67
/*! \brief Generalized Sparse Matrix-Matrix Multiplication. */
void SpMM(const std::string& op, const std::string& reduce,
HeteroGraphPtr graph,
NDArray ufeat,
NDArray efeat,
NDArray out,
std::vector<NDArray> out_aux) {
// TODO(zihao): format tuning
SparseFormat format = graph->SelectFormat(0, csc_code);
const auto& bcast = CalcBcastOff(op, ufeat, efeat);
ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "SpMM", {
ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
ATEN_FLOAT_TYPE_SWITCH(out->dtype, DType, "Feature data", {
if (format == SparseFormat::kCSC) {
SpMMCsr<XPU, IdType, DType>(
op, reduce, bcast, graph->GetCSCMatrix(0),
ufeat, efeat, out, out_aux);
} else if (format == SparseFormat::kCOO) {
SpMMCoo<XPU, IdType, DType>(
op, reduce, bcast, graph->GetCOOMatrix(0),
ufeat, efeat, out, out_aux);
} else {
LOG(FATAL) << "SpMM only supports CSC and COO foramts";
}
});
});
});
}
同样,SDDMM()
的定义也类似:
// dgl/src/array/kernel.cc line 97
/*! \brief Generalized Sampled Dense-Dense Matrix Multiplication. */
void SDDMM(const std::string& op,
HeteroGraphPtr graph,
NDArray lhs,
NDArray rhs,
NDArray out,
int lhs_target,
int rhs_target) {
// TODO(zihao): format tuning
SparseFormat format = graph->SelectFormat(0, coo_code);
const auto &bcast = CalcBcastOff(op, lhs, rhs);
ATEN_XPU_SWITCH_CUDA(graph->Context().device_type, XPU, "SDDMM", {
ATEN_ID_TYPE_SWITCH(graph->DataType(), IdType, {
ATEN_FLOAT_TYPE_SWITCH(out->dtype, DType, "Feature data", {
if (format == SparseFormat::kCSR) {
SDDMMCsr<XPU, IdType, DType>(
op, bcast, graph->GetCSRMatrix(0),
lhs, rhs, out, lhs_target, rhs_target);
} else if (format == SparseFormat::kCOO) {
SDDMMCoo<XPU, IdType, DType>(
op, bcast, graph->GetCOOMatrix(0),
lhs, rhs, out, lhs_target, rhs_target);
} else {
LOG(FATAL) << "SDDMM only supports CSR and COO foramts";
}
});
});
});
}
可以看到他们都支持csr
和coo
两种格式的计算,而v0.4中只看到了支持csr格式的计算。
具体的SpMMCsr
或SDDMMCsr
则定义在/src/array/cpu或/cuda
目录中的spmm
和sddmm
开头的文件中。
(注意,其中的op
定义在/src/array/cpu/spmm_binary_ops.h
中,最终会调用op.call
来计算add
、mul
等。
如果reduce function是sum,则cpu下会调用SpMMSumCsr
或SpMMSumCoo
,不过根据论文中提到的,SpMM应该会选择Csr格式:
// src/array/cpu/spmm.h line 22
/*!
* \brief CPU kernel of SpMM on Csr format.
* \param bcast Broadcast information.
* \param csr The Csr matrix.
* \param ufeat The feature on source nodes.
* \param efeat The feature on edges.
* \param out The result feature on destination nodes.
* \note it uses node parallel strategy, different threads are responsible
* for the computation of different nodes.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
const DType* X = ufeat.Ptr<DType>();
const DType* W = efeat.Ptr<DType>();
int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
DType* O = out.Ptr<DType>();
#if !defined(_WIN32)
typedef dgl::ElemWiseAddUpdate<Op> ElemWiseUpd;
/* Prepare an assembler kernel */
static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
/* Distribute the kernel among OMP threads */
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
if (cpu_spec && dim > 16 && !bcast.use_bcast) {
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
std::fill(out_off, out_off + dim, 0);
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
}
}
} else {
#endif
#pragma omp parallel for
for (IdType rid = 0; rid < csr.num_rows; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
DType* out_off = O + rid * dim;
std::fill(out_off, out_off + dim, 0);
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx ? edges[j] : j;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
out_off[k] += Op::Call(lhs_off, rhs_off);
}
}
}
#if !defined(_WIN32)
}
#endif
}
GPU下则针对csr
和coo
有不同实现:
// /src/array/cuda/spmm.cu line 229
/*!
* \brief CUDA implementation of g-SpMM on Csr format.
* \note use cusparse if the reduce operator is `sum` and there is
* no broadcast, use dgl's kernel in other cases.
*/
template <int XPU, typename IdType, typename DType>
void SpMMCsr(const std::string& op, const std::string& reduce,
const BcastOff& bcast,
const CSRMatrix& csr,
NDArray ufeat,
NDArray efeat,
NDArray out,
std::vector<NDArray> out_aux) {
int64_t feat_len = bcast.out_len;
bool is_scalar_efeat = efeat.NumElements() == csr.indices->shape[0];
bool use_efeat = op != "copy_lhs";
if (reduce == "sum") {
if (sizeof(IdType) == 4 && op == "copy_lhs") { // cusparse
int64_t x_length = 1;
for (int i = 1; i < ufeat->ndim; ++i)
x_length *= ufeat->shape[i];
cusparse::CusparseCsrmm2<DType>(
ufeat->ctx, csr,
static_cast<DType*>(ufeat->data),
nullptr,
static_cast<DType*>(out->data),
x_length);
} else if (sizeof(IdType) == 4 && op == "mul" && is_scalar_efeat) { // cusparse
int64_t x_length = 1;
for (int i = 1; i < ufeat->ndim; ++i)
x_length *= ufeat->shape[i];
if (!IsNullArray(csr.data))
efeat = IndexSelect(efeat, csr.data);
cusparse::CusparseCsrmm2<DType>(
ufeat->ctx, csr,
static_cast<DType*>(ufeat->data),
static_cast<DType*>(efeat->data),
static_cast<DType*>(out->data),
x_length);
} else { // general kernel
SWITCH_OP(op, Op, {
cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Sum<IdType, DType> >(
bcast, csr, ufeat, efeat, out, NullArray(), NullArray());
});
}
} else if (reduce == "max") {
SWITCH_OP(op, Op, {
cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Max<IdType, DType> >(
bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
});
} else if (reduce == "min") {
SWITCH_OP(op, Op, {
cuda::SpMMCsr<IdType, DType, Op, cuda::reduce::Min<IdType, DType> >(
bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]);
});
} else {
LOG(FATAL) << "Not implemented";
}
}
/*!
* \brief CUDA implementation of g-SpMM on Coo format.
*/
template <int XPU, typename IdType, typename DType>
void SpMMCoo(const std::string& op, const std::string& reduce,
const BcastOff& bcast,
const COOMatrix& coo,
NDArray ufeat,
NDArray efeat,
NDArray out,
std::vector<NDArray> out_aux) {
if (reduce == "sum") {
SWITCH_OP(op, Op, {
cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Sum<IdType, DType, true> > (
bcast, coo, ufeat, efeat, out, NullArray(), NullArray());
});
} else if (reduce == "max") {
SWITCH_OP(op, Op, {
cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Max<IdType, DType, true> > (
bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
});
} else if (reduce == "min") {
SWITCH_OP(op, Op, {
cuda::SpMMCoo<IdType, DType, Op, cuda::reduce::Min<IdType, DType, true> > (
bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]);
});
} else {
LOG(FATAL) << "Not implemented";
}
}
可以看到SpMMCsr
会调用cusparse::CusparseCsrmm2
,而SpMMCoo
会调用cuda::SpMMCoo
,前者就在当前文件中,后者则定义在spmm.cuh
中。
注意,op
定义在/src/array/cuda/functor.cuh
中,最终会调用op.call
来完成add
或mul
等计算。
CUDA_KERNEL_CALL
定义在cuda_common.h line 47.
DGLv0.6
0.6中kernel部分引入了FeatGraph,并且支持了float16,但大体逻辑应该和原来是一样的。FeatGraph在v0.6中单独列了一个文件夹,但内容较少,主要是一个sddmm的demo程序。
网上找了一下,发现了yzh119的这个分支:https://github.com/yzh119/dgl/tree/fg-kernels/src/array,可能是将要合入的版本。下一篇看一下这个版本的代码。
小总结一下:
DGL v0.5相比v0.4:
- v0.4中代码逻辑更为复杂,调用不够清晰(似乎是学的Gunrock?有frontier,applyEdge等),这应该也会带来一定的调用开销。而v0.5中抽象出了spmm和sddmm两个主要接口,代码更加简洁了。
- v0.4中计算只发现了针对csr格式的,v0.5中添加了针对coo格式的计算。
- 并行计算上,CPU上都是通过OpenMP手写的并行计算逻辑。GPU上sddmmm也是手写的并行计算。spmm则如果reduce是sum的话,会使用CuSparse进行计算。
DGL v0.6相比v0.5:
- 大体逻辑应该没有变化,支持了float16(默认应该没有开启).
- GPU上SDDMM多了针对COO格式的Tree Reduction方式(参见:https://github.com/dmlc/dgl/pull/2335)。实际测试了一下,在特征维度稍微高点(64、128)效果确实明显,特征维度特别小时(如4)则效果不是很明显。
- CPU上引入了AVX进行优化。
- 引入了tvm,部分支持了FeatGraph(需要手动编译源码)。