CUDA Graph
1. cuda graph的使用场景
- cuda graph在同一个kernel(或kernel组合)要多次执行,且每次只更改kernel参数或者不更改参数时使用,效果更佳;但是如果只是用graph替换已有的kernel组合,且没有重复执行,效率并不高,反而可能低于原始的kernel调用;【此外, graph启动本身也需要耗时】
2. 使用方式
2.1 stream capture 方式
- 基本范式: 通过 begin capture(cudaStreamBeginCapture)和 end capture(cudaStreamEndCapture)以及构建 graph exec(cudaGraphInstantiate)的方式实现graph执行,效率不高;用于graph多次执行的情况。ref: cuda_sample: jacobi
- 不需要先调用 cudaGraphCreate 创建一个graph对象;cudaStreamEndCapture 会直接创建一个graph。
// Record one Jacobi iteration (memset -> kernel -> device-to-host copy) from
// `stream` into a CUDA graph, then run it through a reusable executable graph.
// Work issued to `stream` between Begin/EndCapture is captured into the graph
// rather than executed immediately.
checkCudaErrors(
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
checkCudaErrors(cudaMemsetAsync(d_sum, 0, sizeof(double), stream));
// Even and odd iterations swap the x / x_new arguments of the kernel.
if ((k & 1) == 0) {
  JacobiMethod<<<nblocks, nthreads, 0, stream>>>(A, b, conv_threshold, x,
                                                 x_new, d_sum);
} else {
  JacobiMethod<<<nblocks, nthreads, 0, stream>>>(A, b, conv_threshold,
                                                 x_new, x, d_sum);
}
checkCudaErrors(cudaMemcpyAsync(&sum, d_sum, sizeof(double),
                                cudaMemcpyDeviceToHost, stream));
// cudaStreamEndCapture creates the graph object itself.
checkCudaErrors(cudaStreamEndCapture(stream, &graph));
if (graphExec == NULL) {
  // First pass: build the executable graph from the captured graph.
  checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
} else {
  // Later passes: try to patch the existing executable graph in place.
  cudaGraphExecUpdateResult updateResult_out = cudaGraphExecUpdateError;
  const cudaError_t updateStatus =
      cudaGraphExecUpdate(graphExec, graph, NULL, &updateResult_out);
  // A rejected update is reported as cudaErrorGraphExecUpdateFailure. That is
  // recoverable, so it must not go through checkCudaErrors; otherwise the
  // re-instantiate fallback below could never run. Any other error is still
  // treated as fatal.
  if (updateStatus != cudaErrorGraphExecUpdateFailure) {
    checkCudaErrors(updateStatus);
  }
  if (updateStatus != cudaSuccess ||
      updateResult_out != cudaGraphExecUpdateSuccess) {
    // Reset the error recorded by the rejected update so that later error
    // checks do not report it.
    (void)cudaGetLastError();
    // graphExec is known to be non-NULL in this branch.
    checkCudaErrors(cudaGraphExecDestroy(graphExec));
    graphExec = NULL;
    printf("k = %d graph update failed with error - %d\n", k,
           updateResult_out);
    checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0));
  }
}
checkCudaErrors(cudaGraphLaunch(graphExec, stream));
// Block until the graph finishes so the host can read `sum`.
checkCudaErrors(cudaStreamSynchronize(stream));
// NOTE(review): `graph` is not destroyed anywhere in this snippet; if this
// runs once per iteration, confirm the surrounding code calls
// cudaGraphDestroy(graph), otherwise one graph is leaked per pass.
class MyCudaGraph {
public:
CudaGraph()
: graph_(nullptr),
graph_instance_(nullptr),
stream_(nullptr),
is_captured_(false) {
RPV_CUDA_CHECK(cudaGraphCreate(&graph_, 0));
}
~CudaGraph() {
if (graph_ != nullptr) {
RPV_CUDA_CHECK(cudaGraphDestroy(graph_));
}
if (graph_instance_ != nullptr) {
RPV_CUDA_CHECK(cudaGraphExecDestroy(graph_instance_));
}
}
void set_stream(const cudaStream_t& stream) {
stream_ = stream; }
const cudaGraph_t& graph() const {
return graph_; }
const cudaGraphExec_t& graph_instance() const {
return graph_instance_; }
void CaptureStart(<