第一个错误
Found cuDNN: ver. ??? found (include: /usr/local/cuda-11.4/include, library: /usr/local/cuda-11.4/lib64/libcudnn.so)
解决方法
搜索Cuda.cmake
file(READ ${CUDNN_INCLUDE}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
#file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
第二个错误
没有那个文件或目录
解决方法
在~/sofeware/cudnn-11.4/cuda/include文件夹下执行
sudo cp cudnn_ops_infer.h /usr/local/cuda-11.4/include
sudo cp cudnn_ops_train.h /usr/local/cuda-11.4/include
sudo cp cudnn_adv_infer.h /usr/local/cuda-11.4/include
sudo cp cudnn_adv_train.h /usr/local/cuda-11.4/include
sudo cp cudnn_cnn_infer.h /usr/local/cuda-11.4/include
sudo cp cudnn_cnn_train.h /usr/local/cuda-11.4/include
sudo cp cudnn_backend.h /usr/local/cuda-11.4/include
第三修改cudnn_conv_layer.cpp文件
替换内容
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::Reshape(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
ConvolutionLayer<Dtype>::Reshape(bottom, top);
CHECK_LE(2, this->num_spatial_axes_)
<< "CuDNNConvolution input must have 2 spatial axes "
<< "(e.g., height and width). "
<< "Use 'engine: CAFFE' for general ND convolution.";
bottom_offset_ = this->bottom_dim_ / this->group_;
top_offset_ = this->top_dim_ / this->group_;
const int height = bottom[0]->shape(this->channel_axis_ + 1);
const int width = bottom[0]->shape(this->channel_axis_ + 2);
const int height_out = top[0]->shape(this->channel_axis_ + 1);
const int width_out = top[0]->shape(this->channel_axis_ + 2);
const int* pad_data = this->pad_.cpu_data();
const int pad_h = pad_data[0];
const int pad_w = pad_data[1];
const int* stride_data = this->stride_.cpu_data();
const int stride_h = stride_data[0];
const int stride_w = stride_data[1];
//#if CUDNN_VERSION_MIN(8, 0, 0)
#if 1
int RetCnt;
bool found_conv_algorithm;
size_t free_memory, total_memory;
cudnnConvolutionFwdAlgoPerf_t fwd_algo_pref_[4];
cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];
//get memory sizes
cudaMemGetInfo(&free_memory, &total_memory);
#else
// Specify workspace limit for kernels directly until we have a
// planning strategy and a rewrite of Caffe's GPU memory mangagement
size_t workspace_limit_bytes = 8*1024*1024;
#endif
for (int i = 0; i < bottom.size(); i++) {
cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
this->num_,
this->channels_ / this->group_, height, width,
this->channels_ * height * width,
height * width, width, 1);
cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
this->num_,
this->num_output_ / this->group_, height_out, width_out,
this->num_output_ * this->out_spatial_dim_,
this->out_spatial_dim_, width_out, 1);
cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
filter_desc_, pad_h, pad_w,
stride_h, stride_w);
//#if CUDNN_VERSION_MIN(8, 0, 0)
#if 1
// choose forward algorithm for filter
// in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
bottom_descs_[i],
filter_desc_,
conv_descs_[i],
top_descs_[i],
4,
&RetCnt,
fwd_algo_pref_));
found_conv_algorithm = false;
for(int n=0;n<RetCnt;n++){
if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
fwd_algo_pref_[n].memory < free_memory){
found_conv_algorithm = true;
fwd_algo_[i] = fwd_algo_pref_[n].algo;
workspace_fwd_sizes_[i] = fwd_algo_pref_[n].memory;
break;
}
}
if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
else{
// choose backward algorithm for filter
// for better or worse, just a fixed constant due to the missing
// cudnnGetConvolutionBackwardFilterAlgorithm in cuDNN version 8.0
bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
//twice the amount of the forward search to be save
workspace_bwd_filter_sizes_[i] = 2*workspace_fwd_sizes_[i];
}
// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
filter_desc_,
top_descs_[i],
conv_descs_[i],
bottom_descs_[i],
4,
&RetCnt,
bwd_data_algo_pref_));
found_conv_algorithm = false;
for(int n=0;n<RetCnt;n++){
if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
bwd_data_algo_pref_[n].memory < free_memory){
found_conv_algorithm = true;
bwd_data_algo_[i] = bwd_data_algo_pref_[n].algo;
workspace_bwd_data_sizes_[i] = bwd_data_algo_pref_[n].memory;
break;
}
}
if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
#else
// choose forward and backward algorithms + workspace(s)
CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
bottom_descs_[i],
filter_desc_,
conv_descs_[i],
top_descs_[i],
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes,
&fwd_algo_[i]));
CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
bottom_descs_[i],
filter_desc_,
conv_descs_[i],
top_descs_[i],
fwd_algo_[i],
&(workspace_fwd_sizes_[i])));
// choose backward algorithm for filter
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes, &bwd_filter_algo_[i]) );
// get workspace for backwards filter algorithm
CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));
// choose backward algo for data
CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_limit_bytes, &bwd_data_algo_[i]));
// get workspace size
CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
#endif
}
// reduce over all workspace sizes to get a maximum to allocate / reallocate
size_t total_workspace_fwd = 0;
size_t total_workspace_bwd_data = 0;
size_t total_workspace_bwd_filter = 0;
for (size_t i = 0; i < bottom.size(); i++) {
total_workspace_fwd = std::max(total_workspace_fwd,
workspace_fwd_sizes_[i]);
total_workspace_bwd_data = std::max(total_workspace_bwd_data,
workspace_bwd_data_sizes_[i]);
total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
workspace_bwd_filter_sizes_[i]);
}
// get max over all operations
size_t max_workspace = std::max(total_workspace_fwd,
total_workspace_bwd_data);
max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
// ensure all groups have enough workspace
size_t total_max_workspace = max_workspace *
(this->group_ * CUDNN_STREAMS_PER_GROUP);
// this is the total amount of storage needed over all groups + streams
if (total_max_workspace > workspaceSizeInBytes) {
DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
workspaceSizeInBytes = total_max_workspace;
// free the existing workspace and allocate a new (larger) one
cudaFree(this->workspaceData);
cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
if (err != cudaSuccess) {
// force zero memory path
for (int i = 0; i < bottom.size(); i++) {
workspace_fwd_sizes_[i] = 0;
workspace_bwd_filter_sizes_[i] = 0;
workspace_bwd_data_sizes_[i] = 0;
fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
// NULL out all workspace pointers
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
workspace[g] = NULL;
}
// NULL out underlying data
workspaceData = NULL;
workspaceSizeInBytes = 0;
}
// if we succeed in the allocation, set pointer aliases for workspaces
for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
}
}
// Tensor descriptor for bias.
if (this->bias_term_) {
cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
1, this->num_output_ / this->group_, 1, 1);
}
}