darknet source code analysis: forward_gpu in the layer struct
Source code of the forward_convolutional_layer_gpu() function, the GPU forward pass of a convolutional layer. It covers weight binarization for BINARY/XNOR layers, a bit-packed XNOR fast path, the cuDNN path (with an FP16 variant), a plain im2col + GEMM fallback when cuDNN is not available, and finally batch normalization or bias plus the activation:
void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
{
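    // BINARY layers: binarize the FP32 weights into l.binary_weights_gpu and swap
    // the pointers, so the convolution below runs on the binarized weights.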
    if (l.binary) {
        binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);
        swap_binary(&l);
    }
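    // XNOR layers: re-binarize the weights (and per-filter means) during training,
    // or when the pre-packed bit weights are not available yet.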
    if (l.xnor) {
        if (!l.align_bit_weights_gpu || state.train) {
            fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu);
        }
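        // Inference fast path: with pre-packed bit weights and at least 32 input
        // channels, run the whole convolution on bit-packed data with a custom
        // binary GEMM and return early.
        // m: filters, k: weights per filter, n: output spatial positions;
        // new_ldb: k rounded up to the alignment required by the bit GEMM.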
        if (l.align_bit_weights_gpu && !state.train && l.c >= 32)
        {
            int m = l.n / l.groups;
            int k = l.size*l.size*l.c / l.groups;
            int n = l.out_w*l.out_h;

            int ldb_align = l.lda_align;
            size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
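            // l.c divisible by 32 ("new XNOR"): pack 32 input channels into one uint32,
            // run im2col on the packed tensor, transpose it to the aligned leading
            // dimension, then call the bit GEMM, which also applies the per-filter mean
            // scaling, the bias and (if selected) a fused LEAKY activation.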
            if (l.c % 32 == 0)
            {
                //printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - new XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad);
                //printf("l.align_workspace_size = %d, (l.c * l.w * l.h) = %d \n", l.align_workspace_size, (l.c * l.w * l.h));
                int ldb_align = l.lda_align;
                size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
                const int new_c = l.c / 32;

                repack_input_gpu_bin(state.input, (uint32_t *)l.align_workspace_gpu, l.w, l.h, l.c);

                im2col_ongpu(l.align_workspace_gpu, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);

                int new_k = l.size*l.size*l.c / 32;

                //printf("\n n = %d, n % 32 = %d, new_ldb = %d, new_ldb % 32 = %d \n", n, n % 32, new_ldb, new_ldb % 32);
                transpose_uint32_gpu((uint32_t *)state.workspace, (uint32_t *)l.transposed_align_workspace_gpu, new_k, n, n, new_ldb);

                gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
                    (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
                    new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
                    l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
            }
            else
            {
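                // Fallback XNOR path ("old XNOR"): float im2col into an aligned buffer,
                // convert it to bits, transpose, and call the same bit GEMM.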
//printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - old XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad);
int i = 0;
{
im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
}
transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
(unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
}
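            // LEAKY was already fused into the bit GEMM above; apply any other
            // non-linear activation and return, skipping the regular paths below.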
            if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.output_gpu);
            else if (l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);

            return;
        }
    }
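    // XNOR layers that did not take the fast path: swap in the binarized weights and
    // binarize the input, then fall through to the regular convolution below.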
    if (l.xnor) {
        swap_binary(&l);
        binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
        state.input = l.binary_input_gpu;
    }
#ifdef CUDNN
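    // cuDNN path. When cudnn_half is enabled and the layer shape allows it (input
    // channels per group and filter count divisible by 8), the convolution is run in
    // FP16; otherwise the regular FP32 cuDNN convolution further below is used.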
    float alpha = 1, beta = 0;

    int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
    if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in) &&
        (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && !state.train)
    {
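        // Grow the persistent FP16 staging buffers if this layer needs more space,
        // then convert the FP32 input to FP16.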
        const size_t input16_size = l.batch*l.c*l.w*l.h;
        const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w;

        if (*state.net.max_input16_size < input16_size) {
            *state.net.max_input16_size = input16_size;
            if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
            assert(*state.net.max_input16_size > 0);
            *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
        }
        float *input16 = *state.net.input16_gpu;

        if (*state.net.max_output16_size < output16_size) {
            *state.net.max_output16_size = output16_size;
            if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
            assert(*state.net.max_output16_size > 0);
            *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
        }
        float *output16 = *state.net.output16_gpu;

        assert(input16_size > 0);
        cuda_convert_f32_to_f16(state.input, input16_size, input16);
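        // FP16 convolution using the FP16 tensor/filter descriptors and fw_algo16.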
        CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
            &alpha,
            l.srcTensorDesc16,
            input16,
            l.weightDesc16,
            l.weights_gpu16,
            l.convDesc,
            l.fw_algo16,
            state.workspace,
            l.workspace_size,
            &beta,
            l.dstTensorDesc16,
            output16));
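        // Batch normalization: during training use cuDNN's BN directly on the FP16
        // output (running statistics stay in FP32); at inference convert back to FP32
        // first and apply normalize/scale/bias by hand.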
        if (l.batch_normalize)
        {
            if (state.train) // Training
            {
                simple_copy_ongpu(l.outputs*l.batch / 2, output16, l.x_gpu);

                float one = 1.0f;
                float zero = 0.0f;
                CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                    CUDNN_BATCHNORM_SPATIAL,
                    &one,
                    &zero,
                    l.normDstTensorDescF16,
                    l.x_gpu,                // input
                    l.normDstTensorDescF16,
                    output16,               // output
                    l.normTensorDesc,
                    l.scales_gpu,           // input
                    l.biases_gpu,           // input
                    .01,
                    l.rolling_mean_gpu,     // input/output (should be FP32)
                    l.rolling_variance_gpu, // input/output (should be FP32)
                    .00001,
                    l.mean_gpu,             // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward()
                    l.variance_gpu));       // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward()

                cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
                //forward_batchnorm_layer_gpu(l, state);
            }
            else // Detection
            {
                cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
                normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
                scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
                add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
            }
        }
        else // BIAS only
        {
            cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
        }
    }
    else {
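        // Regular FP32 cuDNN convolution, followed by batch normalization or bias.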
        CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
            &alpha, //&one,
            l.srcTensorDesc,
            state.input,
            l.weightDesc,
            l.weights_gpu,
            l.convDesc,
            l.fw_algo,
            state.workspace,
            l.workspace_size,
            &beta, //&one,
            l.dstTensorDesc,
            l.output_gpu));

        if (l.batch_normalize) {
            forward_batchnorm_layer_gpu(l, state);
        }
        else {
            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
        }
    }
#else
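    // Non-cuDNN fallback: explicit im2col + GEMM, one (batch, group) pair at a time.
    // m = filters per group, k = weights per filter, n = output spatial positions.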
    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    int i, j;
    int m = l.n / l.groups;
    int k = l.size*l.size*l.c / l.groups;
    int n = l.out_w*l.out_h;
    for (i = 0; i < l.batch; ++i) {
        for (j = 0; j < l.groups; ++j) {
            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
            float *a = l.weights_gpu + j*l.nweights / l.groups;
            float *b = state.workspace;
            float *c = l.output_gpu + (i*l.groups + j)*n*m;
            if (l.size == 1) {
                b = im;
            }
            else {
                im2col_gpu_ext(im,          // input
                    l.c / l.groups,         // input channels
                    l.h, l.w,               // input size (h, w)
                    l.size, l.size,         // kernel size (h, w)
                    l.pad, l.pad,           // padding (h, w)
                    l.stride, l.stride,     // stride (h, w)
                    l.dilation, l.dilation, // dilation (h, w)
                    state.workspace);       // output
            }
            //gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
            gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
        }
    }
    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, state);
    }
    else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
    }
#endif
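    // Final activation, restore the original weight pointers for binary/XNOR layers,
    // and optionally scrub NaN/Inf values from the output.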
    if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.output_gpu);
    else if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);

    if (l.binary || l.xnor) swap_binary(&l);

    if (state.net.try_fix_nan) {
        fix_nan_and_inf(l.output_gpu, l.outputs*l.batch);
    }
}
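The non-cuDNN branch shows most plainly how darknet turns a convolution into one matrix multiplication: im2col unfolds the input into a k x n matrix (k = size*size*c/groups rows, n = out_h*out_w columns), the weights form an m x k matrix (m = n/groups filters), and a single GEMM produces the whole output map. Below is a minimal CPU sketch of that mapping, assuming a single image and a single group; it is not darknet's code, and im2col_cpu_ref / gemm_nn_ref are hypothetical names used only for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Unfold a c x h x w image into a (c*size*size) x (out_h*out_w) column matrix. */
static void im2col_cpu_ref(const float *im, int c, int h, int w,
                           int size, int stride, int pad, float *col)
{
    int out_h = (h + 2*pad - size) / stride + 1;
    int out_w = (w + 2*pad - size) / stride + 1;
    int k = c*size*size;       /* rows: one per (channel, ky, kx) weight position */
    int n = out_h*out_w;       /* columns: one per output pixel */
    for (int row = 0; row < k; ++row) {
        int ch = row / (size*size);
        int ky = (row / size) % size;
        int kx = row % size;
        for (int y = 0; y < out_h; ++y) {
            for (int x = 0; x < out_w; ++x) {
                int in_y = y*stride - pad + ky;
                int in_x = x*stride - pad + kx;
                float v = 0.f;   /* zero padding outside the image */
                if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w)
                    v = im[(ch*h + in_y)*w + in_x];
                col[row*n + y*out_w + x] = v;
            }
        }
    }
}

/* C[m x n] += A[m x k] * B[k x n]: same shapes as gemm_ongpu(0, 0, m, n, k, ...). */
static void gemm_nn_ref(int m, int n, int k, const float *A, const float *B, float *C)
{
    for (int i = 0; i < m; ++i)
        for (int p = 0; p < k; ++p)
            for (int j = 0; j < n; ++j)
                C[i*n + j] += A[i*k + p] * B[p*n + j];
}

int main(void)
{
    /* Toy layer: 3 input channels, 5x5 input, 2 filters, 3x3 kernel, stride 1, pad 1. */
    int c = 3, h = 5, w = 5, filters = 2, size = 3, stride = 1, pad = 1;
    int out_h = (h + 2*pad - size) / stride + 1;
    int out_w = (w + 2*pad - size) / stride + 1;
    int m = filters, k = c*size*size, n = out_h*out_w;

    float *im      = calloc((size_t)c*h*w, sizeof(float));
    float *weights = calloc((size_t)m*k, sizeof(float));
    float *col     = calloc((size_t)k*n, sizeof(float));
    float *out     = calloc((size_t)m*n, sizeof(float));
    for (int i = 0; i < c*h*w; ++i) im[i] = (float)i / (c*h*w);
    for (int i = 0; i < m*k; ++i)   weights[i] = 0.1f * (i % 7);

    im2col_cpu_ref(im, c, h, w, size, stride, pad, col);
    gemm_nn_ref(m, n, k, weights, col, out);   /* out[m x n] = weights[m x k] * col[k x n] */

    printf("m=%d k=%d n=%d  out[0]=%f\n", m, k, n, out[0]);
    free(im); free(weights); free(col); free(out);
    return 0;
}

In darknet itself the column buffer is state.workspace, which is sized once for the largest layer in the network, so the loop above can reuse it for every (batch, group) pair; the 1x1 case skips im2col entirely because the input already has the k x n layout.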