使用 CuDNN 进行卷积运算【读书笔记】

22 篇文章 0 订阅

原文链接 http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/ 

以下为长截图,CSDN 限定了图片长度,请点击查看原图


#include <cassert>
#include <cstdlib>
#include <iostream>

#include <cudnn.h>

// NOTE(review): this file also uses cv::Mat plus load_image()/save_image(),
// which come from OpenCV and helper code defined in the original article —
// an OpenCV include (e.g. <opencv2/opencv.hpp>) is required elsewhere.

// http://www.goldsborough.me/cuda/ml/cudnn/c++/2017/10/01/14-37-23-convolutions_with_cudnn/

// Abort with a diagnostic (source line + cuDNN error string) if a cuDNN
// call does not return CUDNN_STATUS_SUCCESS.
//
// Wrapped in do { ... } while (0) so the macro expands to a single
// statement: the original bare { ... } form breaks inside un-braced
// if/else (the trailing ';' at the call site becomes an empty statement
// and detaches a following 'else').
#define checkCUDNN(expression)                                  \
  do {                                                          \
    cudnnStatus_t status = (expression);                        \
    if (status != CUDNN_STATUS_SUCCESS) {                       \
      std::cerr << "Error on line " << __LINE__ << ": "         \
                << cudnnGetErrorString(status) << std::endl;    \
      std::exit(EXIT_FAILURE);                                  \
    }                                                           \
  } while (0)

// Runs a single 3x3 Laplacian-style convolution (optionally followed by a
// sigmoid activation) over an image using the cuDNN forward-convolution API.
//
// Usage: conv <image> [gpu=0] [sigmoid=0]
//   argv[1]  path to the input image (loaded via load_image, defined
//            elsewhere; assumed to yield HWC float data — TODO confirm)
//   argv[2]  optional GPU ordinal (default 0)
//   argv[3]  optional non-zero value to apply a sigmoid after the conv
//
// Output is written to "../cudnn-out.png" via save_image (defined elsewhere).
int main(int argc, const char* argv[]) {
	if (argc < 2) {
		std::cerr << "usage: conv <image> [gpu=0] [sigmoid=0]" << std::endl;
		std::exit(EXIT_FAILURE);
	}

	int gpu_id = (argc > 2) ? std::atoi(argv[2]) : 0;
	std::cerr << "GPU: " << gpu_id << std::endl;

	bool with_sigmoid = (argc > 3) ? (std::atoi(argv[3]) != 0) : false;
	std::cerr << "With sigmoid: " << std::boolalpha << with_sigmoid << std::endl;

	cv::Mat image = load_image(argv[1]);

	cudaSetDevice(gpu_id);

	cudnnHandle_t cudnn;
	// Fix: the original left cudnnCreate unchecked while wrapping every
	// other cuDNN call — a failure here cascades into confusing errors.
	checkCUDNN(cudnnCreate(&cudnn));

	// Input tensor descriptor. NHWC: TensorFlow-style layout where the
	// channel (e.g. BGR) is the fastest-varying dimension; other frameworks
	// prefer channels-first (NCHW).
	cudnnTensorDescriptor_t input_descriptor;
	checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
	checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
		/*format=*/CUDNN_TENSOR_NHWC,
		/*dataType=*/CUDNN_DATA_FLOAT,
		/*batch_size=*/1,
		/*channels=*/3,
		/*image_height=*/image.rows,
		/*image_width=*/image.cols));

	// Filter (kernel) descriptor — note the NCHW layout here, matching the
	// h_kernel array filled below.
	cudnnFilterDescriptor_t kernel_descriptor;
	checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
	checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
		/*dataType=*/CUDNN_DATA_FLOAT,
		/*format=*/CUDNN_TENSOR_NCHW,
		/*out_channels=*/3,
		/*in_channels=*/3,
		/*kernel_height=*/3,
		/*kernel_width=*/3));

	// Convolution descriptor (padding, stride, dilation). Pad=1 with a 3x3
	// kernel and stride 1 keeps the spatial size unchanged ("same" padding).
	// CUDNN_CROSS_CORRELATION is what most DL frameworks call "convolution";
	// CUDNN_CONVOLUTION would additionally flip the kernel.
	cudnnConvolutionDescriptor_t convolution_descriptor;
	checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
	checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
		/*pad_height=*/1,
		/*pad_width=*/1,
		/*vertical_stride=*/1,
		/*horizontal_stride=*/1,
		/*dilation_height=*/1,
		/*dilation_width=*/1,
		/*mode=*/CUDNN_CROSS_CORRELATION,
		/*computeType=*/CUDNN_DATA_FLOAT));

	// Ask cuDNN for the output dimensions implied by the descriptors above.
	int batch_size{ 0 }, channels{ 0 }, height{ 0 }, width{ 0 };
	checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
		input_descriptor,
		kernel_descriptor,
		&batch_size,
		&channels,
		&height,
		&width));

	std::cerr << "Output Image: " << height << " x " << width << " x " << channels
		<< std::endl;

	// Output tensor descriptor (same shape as the input thanks to "same"
	// padding; layout NHWC so the result can be saved directly as an image).
	cudnnTensorDescriptor_t output_descriptor;
	checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
	checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
		/*format=*/CUDNN_TENSOR_NHWC,
		/*dataType=*/CUDNN_DATA_FLOAT,
		/*batch_size=*/1,
		/*channels=*/3,
		/*image_height=*/image.rows,
		/*image_width=*/image.cols));

	// Pick a forward algorithm. Candidates include:
	//   CUDNN_CONVOLUTION_FWD_ALGO_GEMM     — explicit matrix multiply
	//   CUDNN_CONVOLUTION_FWD_ALGO_FFT      — FFT-based convolution
	//   CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD — Winograd transform
	// NOTE(review): cudnnGetConvolutionForwardAlgorithm was deprecated and
	// removed in cuDNN 8; newer code should use
	// cudnnGetConvolutionForwardAlgorithm_v7 / cudnnFindConvolutionForwardAlgorithm.
	cudnnConvolutionFwdAlgo_t convolution_algorithm;
	checkCUDNN(
		cudnnGetConvolutionForwardAlgorithm(cudnn,
		input_descriptor,
		kernel_descriptor,
		convolution_descriptor,
		output_descriptor,
		// Use CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT with a non-zero
		// memoryLimitInBytes when device memory is constrained.
		CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
		/*memoryLimitInBytes=*/0,
		&convolution_algorithm));

	// Query how much scratch memory the chosen algorithm needs.
	size_t workspace_bytes{ 0 };
	checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
		input_descriptor,
		kernel_descriptor,
		convolution_descriptor,
		output_descriptor,
		convolution_algorithm,
		&workspace_bytes));
	std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
		<< std::endl;

	// *************************************************************************
	// Fix: the original asserted workspace_bytes > 0, but some algorithms
	// (e.g. IMPLICIT_GEMM) legitimately need no workspace — a valid setup
	// would abort. Allocate only when a workspace is actually required;
	// cudnnConvolutionForward accepts a null workspace when the size is 0.
	void* d_workspace{ nullptr };
	if (workspace_bytes > 0) {
		cudaMalloc(&d_workspace, workspace_bytes);
	}

	// Byte size of one image, derived from cudnnGetConvolution2dForwardOutputDim.
	int image_bytes = batch_size * channels * height * width * sizeof(float);

	float* d_input{ nullptr };
	cudaMalloc(&d_input, image_bytes);
	cudaMemcpy(d_input, image.ptr<float>(0), image_bytes, cudaMemcpyHostToDevice);

	float* d_output{ nullptr };
	cudaMalloc(&d_output, image_bytes);
	cudaMemset(d_output, 0, image_bytes);
	// *************************************************************************
	// Edge-detection (Laplacian-style) 3x3 kernel, replicated across all
	// output-channel x input-channel pairs below.
	// clang-format off
	const float kernel_template[3][3] = {
		{ 1, 1, 1 },
		{ 1, -8, 1 },
		{ 1, 1, 1 }
	};
	// clang-format on

	float h_kernel[3][3][3][3]; // NCHW, matching kernel_descriptor above
	for (int kernel = 0; kernel < 3; ++kernel) {
		for (int channel = 0; channel < 3; ++channel) {
			for (int row = 0; row < 3; ++row) {
				for (int column = 0; column < 3; ++column) {
					h_kernel[kernel][channel][row][column] = kernel_template[row][column];
				}
			}
		}
	}

	float* d_kernel{ nullptr };
	cudaMalloc(&d_kernel, sizeof(h_kernel));
	cudaMemcpy(d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice);
	// *************************************************************************

	// alpha/beta blend: out = alpha * conv(in) + beta * out.
	const float alpha = 1.0f, beta = 0.0f;

	// The actual forward convolution.
	checkCUDNN(cudnnConvolutionForward(cudnn,
		&alpha,
		input_descriptor,
		d_input,
		kernel_descriptor,
		d_kernel,
		convolution_descriptor,
		convolution_algorithm,
		d_workspace, // may be nullptr when workspace_bytes == 0
		workspace_bytes,
		&beta,
		output_descriptor,
		d_output));

	if (with_sigmoid) {
		// Describe and apply an in-place forward sigmoid activation.
		cudnnActivationDescriptor_t activation_descriptor;
		checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
		checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
			CUDNN_ACTIVATION_SIGMOID,
			CUDNN_PROPAGATE_NAN,
			/*relu_coef=*/0));

		checkCUDNN(cudnnActivationForward(cudnn,
			activation_descriptor,
			&alpha,
			output_descriptor,
			d_output,
			&beta,
			output_descriptor,
			d_output));
		cudnnDestroyActivationDescriptor(activation_descriptor);
	}

	// Fix: image_bytes is a BYTE count; the original allocated image_bytes
	// floats (4x the needed memory). Allocate the matching element count.
	float* h_output = new float[image_bytes / sizeof(float)];
	// Blocking copy, so no explicit synchronization is needed before reading.
	cudaMemcpy(h_output, d_output, image_bytes, cudaMemcpyDeviceToHost);

	save_image("../cudnn-out.png", h_output, height, width);

	delete[] h_output;
	cudaFree(d_kernel);
	cudaFree(d_input);
	cudaFree(d_output);
	cudaFree(d_workspace); // cudaFree(nullptr) is a no-op

	// Destroy descriptors and the cuDNN handle.
	cudnnDestroyTensorDescriptor(input_descriptor);
	cudnnDestroyTensorDescriptor(output_descriptor);
	cudnnDestroyFilterDescriptor(kernel_descriptor);
	cudnnDestroyConvolutionDescriptor(convolution_descriptor);

	cudnnDestroy(cudnn);
}

 

评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

ShaderJoy

您的打赏是我继续写博客的动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值