cuDNN
cuDNN is a closed-source library from NVIDIA that wraps low-level CUDA primitives. In cuDNN, forward propagation and backward propagation are separate, independent functions; it is not as simple as in Python, where you only need to assemble the network in a framework. This article only covers how to build the forward-propagation module, i.e. loading the parameters trained with our Python program and running inference.
Code link: GitHub
cuDNN Operations for VGG16
The code below wraps the cuDNN convolution operation. Almost every cuDNN operation follows the same pattern: create a descriptor, set the descriptor, allocate the memory it needs, copy data from the CPU to the GPU, and finally run the underlying matrix computation.
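The snippets below wrap every cuDNN call in a checkCUDNN macro that is not shown here. A minimal version of such an error-checking macro could look like the following sketch (an assumption, not necessarily how the linked repository defines it):
#include <cudnn.h>
#include <cstdlib>
#include <iostream>
// Abort with a readable message if a cuDNN call does not return CUDNN_STATUS_SUCCESS.
#define checkCUDNN(expression)                                  \
  {                                                             \
    cudnnStatus_t status = (expression);                        \
    if (status != CUDNN_STATUS_SUCCESS) {                       \
      std::cerr << "cuDNN error on line " << __LINE__ << ": "   \
                << cudnnGetErrorString(status) << std::endl;    \
      std::exit(EXIT_FAILURE);                                  \
    }                                                           \
  }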
void conv_foward_layer(
cudnnHandle_t& handle,
const int& batch_size,
const int& x_channels,
const int& x_height,
const int& x_width,
const void *x,
const int& kernel_size,
const void *kernel,
const void *bias,
const int& padding,
const int& stride,
const int& y_channels,
const int& y_height,
const int& y_width,
const size_t& y_bytes,
void *y,
int activate_type)
{
// Describe the input tensor
cudnnTensorDescriptor_t input_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(input_descriptor,
/*format=*/CUDNN_TENSOR_NHWC, // note: NHWC. TensorFlow prefers storing tensors as NHWC (the channel dimension, e.g. BGR, varies fastest), while some other frameworks put the channel first
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/batch_size,
/*channels=*/x_channels,
/*image_height=*/x_height,
/*image_width=*/x_width));
// Describe the convolution kernel (shape, format)
cudnnFilterDescriptor_t kernel_descriptor;
checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor,
/*dataType=*/CUDNN_DATA_FLOAT,
/*format=*/CUDNN_TENSOR_NCHW, // note: the filter descriptor here uses NCHW, unlike the NHWC input/output tensors
/*out_channels=*/y_channels,
/*in_channels=*/x_channels,
/*kernel_height=*/kernel_size,
/*kernel_width=*/kernel_size));
// Describe the convolution operation (stride, padding, etc.)
cudnnConvolutionDescriptor_t convolution_descriptor;
checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
checkCUDNN(cudnnSetConvolution2dDescriptor(convolution_descriptor,
/*pad_height=*/padding,
/*pad_width=*/padding,
/*vertical_stride=*/stride,
/*horizontal_stride=*/stride,
/*dilation_height=*/1,
/*dilation_width=*/1,
/*mode=*/CUDNN_CROSS_CORRELATION, // CUDNN_CONVOLUTION
/*computeType=*/CUDNN_DATA_FLOAT));
// Describe the output tensor
cudnnTensorDescriptor_t output_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(output_descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/batch_size,
/*channels=*/y_channels,
/*image_height=*/y_height,
/*image_width=*/y_width));
// Choose the forward convolution algorithm, e.g.:
// CUDNN_CONVOLUTION_FWD_ALGO_GEMM     - models the convolution as an explicit matrix multiplication,
// CUDNN_CONVOLUTION_FWD_ALGO_FFT      - performs the convolution with a Fast Fourier Transform (FFT), or
// CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD - performs the convolution with the Winograd algorithm.
cudnnConvolutionFwdAlgo_t convolution_algorithm;
checkCUDNN(
cudnnGetConvolutionForwardAlgorithm(handle,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, // or CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT (when memory is constrained, set memoryLimitInBytes to a non-zero value)
/*memoryLimitInBytes=*/0,
&convolution_algorithm));
// Query how much workspace memory cuDNN needs for this operation
size_t workspace_bytes{ 0 };
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(handle,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
convolution_algorithm,
&workspace_bytes));
if (workspace_bytes == 0) workspace_bytes = 1024 * 1048576; // fall back to 1 GiB if the algorithm reports no workspace requirement, so the assert below still passes
std::cerr << "Workspace size: " << (workspace_bytes / 1048576.0) << "MB"
<< std::endl;
assert(workspace_bytes > 0);
// Allocate the workspace sized by cudnnGetConvolutionForwardWorkspaceSize above
void* y_workspace = nullptr;
cudaMalloc(&y_workspace, workspace_bytes);
// The output size can be derived from cudnnGetConvolution2dForwardOutputDim
//size_t y_bytes = batch_size * y_channels * y_height * y_width * sizeof(float);
//cudaMalloc(&y, y_bytes);
//cudaMemset(y, 0, y_bytes);
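// A sketch (not in the original code) of how the output dimensions could instead be
// queried from cuDNN once the descriptors above are set:
// int out_n, out_c, out_h, out_w;
// checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convolution_descriptor,
//                                                  input_descriptor,
//                                                  kernel_descriptor,
//                                                  &out_n, &out_c, &out_h, &out_w));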
// The actual forward convolution
const float alpha = 1.0f, beta = 0.0f;
checkCUDNN(cudnnConvolutionForward(handle,
&alpha,
input_descriptor,
x,
kernel_descriptor,
kernel,
convolution_descriptor,
convolution_algorithm,
y_workspace, // note: if the chosen convolution algorithm needs no extra memory, the workspace may be nullptr
workspace_bytes, // size of the workspace buffer (not the output buffer)
&beta,
output_descriptor,
y));
// add conv bias
if (bias != nullptr)
{
cudnnTensorDescriptor_t bias_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&bias_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(bias_descriptor,
/*format=*/CUDNN_TENSOR_NHWC,
/*dataType=*/CUDNN_DATA_FLOAT,
/*batch_size=*/1,
/*channels=*/y_channels,
/*bias_height=*/1,
/*bias_width=*/1));
checkCUDNN(cudnnAddTensor(
handle,
&alpha,
bias_descriptor,
bias,
&beta,
output_descriptor,
y));
cudnnDestroyTensorDescriptor(bias_descriptor);
}
// activation layer
if (activate_type != -1) {
// Describe the activation function
cudnnActivationDescriptor_t activation_descriptor;
checkCUDNN(cudnnCreateActivationDescriptor(&activation_descriptor));
switch (activate_type)
{
case CUDNN_ACTIVATION_SIGMOID:
checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
CUDNN_ACTIVATION_SIGMOID,
CUDNN_PROPAGATE_NAN,
/*relu_coef=*/0));
break;
case CUDNN_ACTIVATION_RELU:
checkCUDNN(cudnnSetActivationDescriptor(activation_descriptor,
CUDNN_ACTIVATION_RELU,
CUDNN_PROPAGATE_NAN,
/*relu_coef=*/0));
break;
default:
break;
}
// Forward pass of the activation function (sigmoid or ReLU, as configured above)
checkCUDNN(cudnnActivationForward(handle,
activation_descriptor,
&alpha,
output_descriptor,
y,
&beta,
output_descriptor,
y));
cudnnDestroyActivationDescriptor(activation_descriptor);
}
/*if (x_channels == y_channels && y_height == x_height)
{
cudnnAddTensor(
handle,
&alpha,
input_descriptor,
x,
&beta,
output_descriptor,
y);
}*/
/*float* y_output = new float[y_bytes];
cudaMemcpy(y_output, y, y_bytes, cudaMemcpyDeviceToHost);
save_image("./cv.png", y_output, y_height, y_width);*/
cudaFree(y_workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroyConvolutionDescriptor(convolution_descriptor);
}
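For reference, here is a hypothetical call to this wrapper for VGG16's first convolution layer (3-channel 224x224 input, 64 output channels, 3x3 kernel, padding 1, stride 1). The device buffers d_input, d_conv1_kernel, d_conv1_bias and d_conv1_output are assumed to have been allocated and filled elsewhere; this is a usage sketch, not code from the linked repository.
// Hypothetical usage: VGG16 conv1_1, 224x224x3 -> 224x224x64.
size_t conv1_out_bytes = 1 * 64 * 224 * 224 * sizeof(float);
conv_foward_layer(cudnn,
                  /*batch_size=*/1,
                  /*x_channels=*/3,
                  /*x_height=*/224,
                  /*x_width=*/224,
                  d_input,
                  /*kernel_size=*/3,
                  d_conv1_kernel,
                  d_conv1_bias,
                  /*padding=*/1,
                  /*stride=*/1,
                  /*y_channels=*/64,
                  /*y_height=*/224,
                  /*y_width=*/224,
                  conv1_out_bytes,
                  d_conv1_output,
                  /*activate_type=*/CUDNN_ACTIVATION_RELU);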
cuDNN has no dedicated interface for fully connected layers, but we can implement them as convolutions: the first FC layer uses a kernel that covers the whole input feature map (for VGG16, a 7x7 kernel over the 512-channel 7x7 feature map yields a 1x1x4096 output), and the following FC layers use 1x1 kernels. The code below applies this to the last three fully connected layers of VGG16.
for (int fc_layer = 0; fc_layer < 3; fc_layer++)
{
// Compute the output tensor dimensions and memory size
out_height = 1;
out_width = 1;
out_bytes = batch_size * fc_sizes[fc_layer] * out_height * out_width * sizeof(float);
cudaMalloc(&output_tensor, out_bytes);
cudaMemset(output_tensor, 0, out_bytes); // zero-initialize the output buffer on the device
// *************************************************************************
// load fully connected parameters
float* d_fc_kernel = nullptr;
cudaMalloc(&d_fc_kernel, sizeof(float) * fc_kernels[fc_layer].size() * in_height * in_width);
cudaMemcpy(d_fc_kernel, fc_kernels[fc_layer].data(), sizeof(float) * fc_kernels[fc_layer].size() * in_height * in_width, cudaMemcpyHostToDevice);
//load fully connected bias
float* d_fc_bias = nullptr;
cudaMalloc(&d_fc_bias, sizeof(float)*fc_bias[fc_layer].size());
cudaMemcpy(d_fc_bias, fc_bias[fc_layer].data(), sizeof(float) * fc_bias[fc_layer].size(), cudaMemcpyHostToDevice);
// *************************************************************************
// *************************************************************************
int x_channels = 0, fc_kernel_size = 0;
if (fc_layer == 0)
{
x_channels = conv_kernel_sizes[12][2];
fc_kernel_size = in_height;
}
else
{
x_channels = fc_sizes[fc_layer - 1];
fc_kernel_size = 1;
}
// perform fully connected operation
conv_foward_layer(
cudnn,
batch_size,
/*x_channels*/x_channels,
in_height,
in_width,
input_tensor,
/*kernel_size*/fc_kernel_size,
d_fc_kernel,
d_fc_bias,
0,
1,
/*y_channels*/fc_sizes[fc_layer],
out_height,
out_width,
out_bytes,
output_tensor,
1); // activate_type = 1, i.e. CUDNN_ACTIVATION_RELU
in_height = out_height;
in_width = out_width;
input_bytes = out_bytes;
cudaFree(input_tensor);
input_tensor = nullptr;
cudaMalloc(&input_tensor, input_bytes);
cudaMemcpy(input_tensor, output_tensor, input_bytes, cudaMemcpyDeviceToDevice); // both buffers live on the GPU
cudaFree(output_tensor);
output_tensor = nullptr;
cudaFree(d_fc_kernel);
d_fc_kernel = nullptr;
cudaFree(d_fc_bias);
d_fc_bias = nullptr;
}
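After the last fully connected layer, VGG16 produces 1000 class scores; for prediction they can be converted into probabilities with cuDNN's softmax. The following is a sketch under the assumption that, as in the loop above, input_tensor now holds the output of the final FC layer and input_bytes its size; d_probs is a hypothetical device buffer for the result, and this code is not part of the original repository.
// Sketch: convert the 1000-way logits of the last FC layer into class probabilities.
float* d_probs = nullptr;
cudaMalloc(&d_probs, input_bytes);
cudnnTensorDescriptor_t logits_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&logits_descriptor));
checkCUDNN(cudnnSetTensor4dDescriptor(logits_descriptor,
                                      CUDNN_TENSOR_NHWC,
                                      CUDNN_DATA_FLOAT,
                                      batch_size,
                                      /*channels=*/fc_sizes[2], // 1000 classes for VGG16
                                      /*height=*/1,
                                      /*width=*/1));
const float softmax_alpha = 1.0f, softmax_beta = 0.0f;
checkCUDNN(cudnnSoftmaxForward(cudnn,
                               CUDNN_SOFTMAX_ACCURATE,
                               CUDNN_SOFTMAX_MODE_CHANNEL,
                               &softmax_alpha,
                               logits_descriptor,
                               input_tensor, // output of the last FC layer
                               &softmax_beta,
                               logits_descriptor,
                               d_probs));
cudnnDestroyTensorDescriptor(logits_descriptor);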
Environment
vs2015
opencv>=3.0
CUDA (matching your machine's environment)
cuDNN (version matching your CUDA version)
References
[1] Convolution operations with cuDNN
[2] CUDNN 7.6.3 Docs