YOLOv2源码分析（二）

最新推荐文章于 2023-01-02 22:34:11 发布

一个新新的小白

最新推荐文章于 2023-01-02 22:34:11 发布

阅读量531

点赞数

分类专栏：算法

算法专栏收录该内容

80 篇文章 2 订阅

订阅专栏

文章全部YOLOv2源码分析

接着上一讲没有讲完的make_convolutional_layer函数

0x01 make_convolutional_layer

    //make_convolutional_layer
    l.forward = forward_convolutional_layer;
    l.backward = backward_convolutional_layer;
    l.update = update_convolutional_layer;    
 
 1
2
3
4

上来就是三座大山^_^，我们先从第一个forward_convolutional_layer开始。

0x0101 forward_convolutional_layer

void forward_convolutional_layer(convolutional_layer l, network net)
{//传入卷积层参数和网络的总参数
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
 
 1
2
3
4
5

看这个fill_cpu函数

/*
 * Store ALPHA into N elements of X, visiting every INCX-th slot
 * (X[0], X[INCX], X[2*INCX], ...). Nothing is written when N <= 0.
 */
void fill_cpu(int N, float ALPHA, float *X, int INCX)
{
    int slot = 0;
    while (slot < N) {
        X[slot * INCX] = ALPHA;
        ++slot;
    }
}
 
 1
2
3
4
5

输入的参数N表示一个batch中所有的图像元素个数，x指向n对应大小分配的内存空间。整个函数来看就是对输出图像元素的一个初始化操作。

接着看后面

    //forward_convolutional_layer
    if(l.xnor){
        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
        net.input = l.binary_input;
    }    
 
 1
2
3
4
5
6
7

判断是否二值化操作，如果是的话，其中有两个关键的函数binarize_weights和binarize_cpu

/*
 * Binarize a bank of filters.
 *
 * weights: n*size inputs; filter f occupies [f*size, (f+1)*size).
 * n:       number of filters.
 * size:    number of weights per filter.
 * binary:  n*size outputs, same layout as weights.
 *
 * Every output is +m or -m, where m is the mean absolute value of the
 * filter it belongs to. Weights that are not strictly positive map to -m.
 */
void binarize_weights(float *weights, int n, int size, float *binary)
{
    int filter, k;
    for (filter = 0; filter < n; ++filter) {
        const int base = filter * size;
        float magnitude = 0;

        /* Mean absolute value of this filter. */
        for (k = 0; k < size; ++k) {
            magnitude += fabs(weights[base + k]);
        }
        magnitude = magnitude / size;

        /* Keep only the sign, scaled by that magnitude. */
        for (k = 0; k < size; ++k) {
            if (weights[base + k] > 0) {
                binary[base + k] = magnitude;
            } else {
                binary[base + k] = -magnitude;
            }
        }
    }
}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14

第一个参数就是指向分配给weight内存空间的指针，第二参数是卷积核个数，第三个参数是一个卷积核weight的个数（这里应该使用l.nweights/l.n），第四个参数是指向分配给二值化weight内存空间的指针。举个例子

假设有两个2x2卷积核
n=2  size=4
权重值总共8个 1 2 3 4 5 6 7 8

第一次循环 f=0 
mean = 1+2+3+4 = 10
mean/4 = 2.5
binary[0]=2.5 binary[1]=2.5 binary[2]=2.5 binary[3]=2.5

第二次循环 f=1
mean = 5+6+7+8 = 26
mean/4 = 6.5
binary[4]=6.5 binary[5]=6.5 binary[6]=6.5 binary[7]=6.5
 
 1
2
3
4
5
6
7
8
9
10
11
12
13

接着看后面的swap_binary函数

/*
 * Exchange the layer's `weights` and `binary_weights` pointers (and, when
 * built with GPU, the matching `*_gpu` pointers). Only the pointers are
 * swapped; no weight data is copied. Calling it a second time restores the
 * original assignment.
 */
void swap_binary(convolutional_layer *l)
{
    float *held = l->weights;
    l->weights = l->binary_weights;
    l->binary_weights = held;

#ifdef GPU
    held = l->weights_gpu;
    l->weights_gpu = l->binary_weights_gpu;
    l->binary_weights_gpu = held;
#endif
}
 
 1
2
3
4
5
6
7
8
9
10
11
12

函数的作用很明显了，就是把原来的权重指针和二值化后的权重指针互换，使后续计算使用二值化后的权重。

接着binarize_cpu函数

/*
 * Write the sign of each of the n values in `input` to `binary`:
 * +1 for strictly positive inputs, -1 for everything else (including 0).
 */
void binarize_cpu(float *input, int n, float *binary)
{
    int idx;
    for (idx = 0; idx < n; ++idx) {
        if (input[idx] > 0) {
            binary[idx] = 1;
        } else {
            binary[idx] = -1;
        }
    }
}
 
 1
2
3
4
5
6
7

函数的第一个参数指向输入图像内存空间的指针，函数第二个参数表示一个batch的图像元素个数，函数第三个参数指向分配给二值化input内存空间的指针。

函数很简单，总体来看函数的作用就是输入图像的二值化。

最后将得到的二值化输入图像赋值给原来的输入图像。

我们接着回到forward_convolutional_layer函数

    //forward_convolutional_layer
    int m = l.n/l.groups;//一个group的卷积核个数
    int k = l.size*l.size*l.c/l.groups;//一个group的卷积核元素个数
    int n = l.out_w*l.out_h;//一个输出图像的元素个数
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output + (i*l.groups + j)*n*m;

            im2col_cpu(net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w,
                l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }

 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

这里有两个非常重要的函数im2col_cpu和gemm。先看第一个

0x0102 im2col_cpu && gemm

/*
 * Fetch one pixel from a channel-major image as if it were surrounded by a
 * zero border of width `pad`.
 *
 * row/col are coordinates in the padded image. They are shifted back by
 * `pad`, and anything that lands outside [0,height) x [0,width) reads as 0.
 * `channels` is accepted but not used by the lookup.
 */
float im2col_get_pixel(float *im, int height, int width, int channels,
                        int row, int col, int channel, int pad)
{
    const int y = row - pad;
    const int x = col - pad;
    const int inside = (y >= 0) && (x >= 0) && (y < height) && (x < width);

    if (!inside) {
        return 0;
    }
    return im[x + width*(y + height*channel)];
}

//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
/*
 * Unroll an image into the column matrix used to express convolution as a
 * matrix product.
 *
 * data_im:  input image, channel-major (channels x height x width).
 * ksize:    kernel side length; stride, pad: convolution stride and padding.
 * data_col: output, (channels*ksize*ksize) rows by (out_h*out_w) columns,
 *           row-major. Row r holds, for every output position, the input
 *           pixel seen by kernel tap r (zero where the tap is in the padding).
 */
void im2col_cpu(float* data_im,
     int channels,  int height,  int width,
     int ksize,  int stride, int pad, float* data_col) 
{
    /* Spatial size of the convolution output. */
    const int out_h = (height + 2*pad - ksize) / stride + 1;
    const int out_w = (width + 2*pad - ksize) / stride + 1;
    /* One output row per (channel, kernel row, kernel column) triple. */
    const int taps = channels * ksize * ksize;
    /* The loops below visit data_col in storage order, so a running index
       is enough. */
    int out = 0;
    int tap, oy, ox;

    for (tap = 0; tap < taps; ++tap) {
        const int kx = tap % ksize;
        const int ky = (tap / ksize) % ksize;
        const int src_channel = tap / ksize / ksize;

        for (oy = 0; oy < out_h; ++oy) {
            const int src_row = ky + oy * stride;
            for (ox = 0; ox < out_w; ++ox) {
                const int src_col = kx + ox * stride;
                data_col[out] = im2col_get_pixel(data_im, height, width, channels,
                        src_row, src_col, src_channel, pad);
                ++out;
            }
        }
    }
}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

这个函数是参考了早期caffe中的设计，但是现在caffe好像有了新的做法。首先说说这个函数的参数

data_im:指向输入数据的指针
channels:一个卷积组的通道数
height:输入图像的高
width:输入图像的宽
ksize:卷积核的大小
stride:步长大小
pad:pad大小
data_col:指向数据转化后的内存空间

这个函数比较复杂，还是举个例子说明

我们假设输入图片大小3x3，pad=1，stride=2，卷积核大小3x3，channels=1
0 0 0 0 0
0 1 2 3 0
0 4 5 6 0
0 7 8 9 0
0 0 0 0 0
height_col = (3+2-3)/2+1 = 2
width_col = (3+2-3)/2+1 = 2
channels_col = 1*3*3 = 9
进入第一个循环c = 0
w_offset = 0
h_offset = 0
c_im = 0

h=0    w=0
im_row = 0
im_col = 0
col_index = 0
data_col[0] = 0

h=0    w=1
im_row = 0
im_col = 2
col_index = 1
data_col[1] = 0
...
data_col[2]=0 data_col[3]=5 
data_col[4]=0 data_col[5]=0 data_col[6]=4 data_col[7]=6
...

0 0 0 0 0     
0 1 2 3 0     
0 4 5 6 0 ==>  0 0 0 5 0 0 4 6 0 0 5 0 0 2 0 8 1 3 7 9 2 0 8 0 0 5 0 0 4 6 0 0 5 0 0 0
0 7 8 9 0     
0 0 0 0 0
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

翻译成人能看得懂的就是

这个矩阵有什么特殊的含义呢?

我们不难发现，这个矩阵的每一列就表示卷积核对应的一个小窗口，例如第一个窗口0 0 0 0 1 2 0 4 5，很有意思是不是？

接着我们再来看看这个gemm函数

/*
 * General matrix multiply entry point.
 *
 * Forwards every argument, unchanged and in the same order, to gemm_cpu();
 * it performs no work of its own.
 */
void gemm(int TA, int TB, int M, int N, int K, float ALPHA, 
        float *A, int lda, 
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    gemm_cpu( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
}
/*
 * CPU matrix multiply: scales the M x N matrix C by BETA in place, then
 * hands off to the kernel matching the transpose flags, which accumulates
 * the ALPHA-scaled product into C.
 *
 * TA / TB: non-zero when A / B is to be used transposed.
 * lda, ldb, ldc: row strides of A, B and C.
 */
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, 
        float *A, int lda, 
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    int row, col;

    /* Pre-scale the destination. With BETA == 1 this leaves C unchanged. */
    for (row = 0; row < M; ++row) {
        for (col = 0; col < N; ++col) {
            C[row*ldc + col] *= BETA;
        }
    }

    /* Kernel suffix: first letter is for A, second for B
       (n = not transposed, t = transposed). */
    if (TA) {
        if (TB) {
            gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        } else {
            gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        }
    } else {
        if (TB) {
            gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        } else {
            gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
        }
    }
}
/*
 * C += ALPHA * A * B for row-major matrices, neither operand transposed.
 *
 * M, N, K:       A is M x K, B is K x N, C is M x N.
 * lda, ldb, ldc: row strides (elements between the starts of consecutive
 *                rows) of A, B and C.
 *
 * C is accumulated into, not overwritten, so the caller decides its
 * starting contents.
 */
void gemm_nn(int M, int N, int K, float ALPHA, 
        float *A, int lda, 
        float *B, int ldb,
        float *C, int ldc)
{
    int i;
    #pragma omp parallel for
    for(i = 0; i < M; ++i){
        /* Declared inside the loop body on purpose: when built with OpenMP,
           only the index of the associated loop (i) is private by default.
           Counters declared at function scope would be shared by all
           threads and race with each other. */
        int j, k;
        for(k = 0; k < K; ++k){
            /* A[i][k] is invariant across the innermost loop. */
            const float A_PART = ALPHA*A[i*lda+k];
            for(j = 0; j < N; ++j){
                C[i*ldc+j] += A_PART*B[k*ldb+j];
            }
        }
    }
}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

由于gemm前面传入的参数是0,0，所以我这里只看gemm_nn这个函数，其他函数操作相似，不再赘述。

我们还是先看看这个函数的参数

M: A的行数
N: B的列数
K: A的列数
ALPHA:系数
A:指向矩阵a的指针
lda: a的列数
B:指向矩阵b的指针
ldb: b的列数
C:指向矩阵c的指针
ldc: c的列数

我们知道这里A就是输入weight的矩阵，B就是我们前面im2col_cpu中得到的输出矩阵，C用来存储我们最后得到的矩阵（其实是一个数组，前面说的矩阵也是）。M 一个group的卷积核个数，K一个group的卷积核元素个数，N 一个输出图像的元素个数，lda一个group的卷积核元素个数，ldb一个输出图像的元素个数，ldc一个输出图像的元素个数。

我们还是举个例子说明

这里我们假设卷积核还是3x3
权重矩阵A为
1 2 3
4 5 6  ==> 1 2 3 4 5 6 7 8 9（应该这样写）
7 8 9

B为
0 0 0 5
0 0 4 6
0 0 5 0
0 2 0 8
1 3 7 9
2 0 8 0
0 5 0 0
4 6 0 0
5 0 0 0

C初始化后为
1 1 1 1

M=1 K=9 N=4 lda=9 ldb=4 ldc=4
C[0]=C[0]+ALPHA*A[0]*B[0]+ALPHA*A[1]*B[4]+...+ALPHA*A[8]*B[32]=1+94=95
C[1]=107
C[2]=107
C[3]=95
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

换成人能看懂的

                           B
                       [0 0 0 5
                        0 0 4 6
                        0 0 5 0
         A              0 2 0 8       C               C
[1 2 3 4 5 6 7 8 9]  *  1 3 7 9 + [1 1 1 1]==> [95 107 107 95]
                        2 0 8 0
                        0 5 0 0
                        4 6 0 0
                        5 0 0 0]
 
 1
2
3
4
5
6
7
8
9
10

所以这两个函数的意图很明显了，就是将卷积变换成了矩阵运算。一些有意思的数学技巧^_^!!!

最后简要的提一下gemm_nn 、gemm_tn、gemm_tt、gemm_nt他们之间的区别，他们的命名都是有意义的。这里的n指的是not transpose而t指的是transpose。例如nn就表示AB都不转置。

接着我们回到forward_convolutional_layer函数

//forward_convolutional_layer
    if(l.batch_normalize){
        forward_batchnorm_layer(l, net);   
 
 1
2
3

这里又出现一个有用的函数forward_batchnorm_layer

0x0103 forward_batchnorm_layer

void forward_batchnorm_layer(layer l, network net)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
    copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
 
 1
2
3
4

上来就是一个函数copy_cpu

/*
 * Strided copy: for each i in [0, N), Y[i*INCY] receives X[i*INCX].
 * Nothing is copied when N <= 0.
 */
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int n = 0;
    while (n < N) {
        Y[n * INCY] = X[n * INCX];
        ++n;
    }
}
 
 1
2
3
4
5

我们先看一下输入的参数分别表示的是什么意思。N表示要复制的元素个数（这里是一个batch中的输出元素个数），X表示指向源数据的指针，Y表示指向目标内存的指针，INCX和INCY分别是两者的步长。注意这里的BATCHNORM不是宏定义，而是layer的类型：如果当前layer的类型是BATCHNORM（即独立的BN层），那么先将net中的输入复制到layer的输出；随后无论layer是什么类型，都会将layer的输出复制到layer的x中保存。接着看后面（可以参考这篇论文Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift）

//forward_batchnorm_layer
    if(net.train){
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);

        scal_cpu(l.out_c, .99, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .99, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);

        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);   
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else {
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

我准备把这几个函数放在一块解析，因为这几个函数都不大。先看mean_cpu

/*
 * Per-channel mean over the batch and spatial dimensions.
 *
 * x:       input laid out as [batch][filters][spatial].
 * batch:   number of samples.
 * filters: number of channels.
 * spatial: number of elements per channel plane.
 * mean:    output, one value per channel.
 */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    /* Reciprocal of the number of samples contributing to each channel. */
    const float inv_count = 1./(batch * spatial);
    int ch, b, s;

    for (ch = 0; ch < filters; ++ch) {
        mean[ch] = 0;
        for (b = 0; b < batch; ++b) {
            /* Start of channel `ch` inside sample `b`. */
            const int base = b*filters*spatial + ch*spatial;
            for (s = 0; s < spatial; ++s) {
                mean[ch] += x[base + s];
            }
        }
        mean[ch] *= inv_count;
    }
}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

x:指向layer的输出
batch:一个batch的大小
filters:输出的图像通道数，在这里同样可以理解为卷积核个数
spatial:输出图片的大小
mean:指向保存结果的指针

还是举个例子

x [95 107 107 95 1 2 3 4]
batch = 1
filters = 2
spatial = 2x2 = 4

scale = 1/(1x4) = 0.25
第一次循环
i=0 j=0
mean[0]=0
k=0
index=0
mean[0]=0+x[0]=95
...
mean[0]=101 mean[1]=2.5
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14

那么这个函数的意义就很明晰了。它要求出的是不同通道下所有输入图像的均值。对应BN论文中的这个公式

$\mu_\beta \leftarrow \frac{1}{m}\sum_{i=1}^{m} x_i$ //mini-batch mean

接着看variance_cpu函数

/*
 * Per-channel sample variance over the batch and spatial dimensions.
 *
 * x:        input laid out as [batch][filters][spatial].
 * mean:     per-channel means of x (one per filter).
 * batch:    number of samples.
 * filters:  number of channels.
 * spatial:  number of elements per channel plane.
 * variance: output, one value per channel.
 *
 * The sum of squared deviations is divided by (count - 1), where
 * count = batch * spatial. When count <= 1 that divisor is zero or
 * negative; the previous code then produced NaN (0 * inf) for a single
 * sample. Such channels now report a variance of 0.
 */
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
    const int count = batch * spatial;
    /* Note the (count - 1) divisor; guarded so it can never be 1/0. */
    const float scale = (count > 1) ? 1./(count - 1) : 0;
    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                /* Square in double, as pow() did, without the library call. */
                const double diff = x[index] - mean[i];
                variance[i] += diff*diff;
            }
        }
        variance[i] *= scale;
    }
}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

x:指向layer的输出指针
mean:指向上面函数输出的均值
batch:batch大小
filters:输出的图像通道数，在这里同样可以理解为卷积核个数
spatial:输出图片的大小
variance:指向保存结果的指针

举个例子

x [95 107 107 95 1 2 3 4]
mean [101 2.5]
batch = 1
filters = 2
spatial = 2x2 = 4  
scale = 1/(1x4 - 1)=0.333
i=0
variance[0]=0
j=0 k=0
index=0
variance[0] = 0+(95-101)^2
...
variance[0]=48 variance[1]=1.66666675
 
 1
2
3
4
5
6
7
8
9
10
11
12
13

那么这个函数的意义就很明晰了。它要求出的是不同通道下所有输入图像的样本方差（对于n个数据，如果n-1个确定了，那么剩下的那个就确定了（前提知道均值，均值*n - (n-1)数））。对应BN论文中的这个公式

$\sigma_\beta^2 \leftarrow \frac{1}{m}\sum_{i=1}^{m}(x_i-\mu_\beta)^2$ //mini-batch variance（论文中除以 m，而代码中除以 m−1）

接着看scal_cpu函数

/*
 * Scale in place: multiplies X[0], X[INCX], X[2*INCX], ... (N elements in
 * total) by ALPHA.
 */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    int n = 0;
    while (n < N) {
        X[n * INCX] *= ALPHA;
        ++n;
    }
}
 
 1
2
3
4
5

这个函数很简单，就是将输入的数据乘以一个系数。

接着看axpy_cpu函数

/*
 * Y = ALPHA*X + Y over N elements, reading X with stride INCX and updating
 * Y with stride INCY.
 */
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int n = 0;
    while (n < N) {
        Y[n * INCY] += ALPHA*X[n * INCX];
        ++n;
    }
}
 
 1
2
3
4
5

这个函数也很简单，就是Y =ALPHA*X + Y

接着看normalize_cpu这个函数

void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
{
    int b, f, i;
    for(b = 0; b < batch; ++b){
        for(f = 0; f < filters; ++f){
            for(i = 0; i < spatial; ++i){
                int index = b*filters*spatial + f*spatial + i;
                x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
            }
        }
    }
}
 
 1
2
3
4
5
6
7
8
9
10
11
12

x：layer的输出图像
mean:前面算的均值
variance:前面算的样本方差
batch:batch大小
filters:输出的图像通道数，在这里同样可以理解为卷积核个数
spatial:输出图片的大小

还是举个例子

x [95 107 107 95 1 2 3 4]
mean [101 2.5]
variance [48 1.66666675]
batch=1
filters=2
spatial = 2x2 = 4
进入第一层循环
b=0 f=0 i=0
index = 0
x[0] = (x[0]-mean[0])/(sqrt(variance[0]) + 0.000001f) = -0.866
...
x[0]=-0.866025329 x[1]=0.866025329  x[2]=0.866025329 x[3]=-0.866025329
x[4]=-1.16189408  x[5]=-0.387298018 x[6]=0.387298018 x[7]=1.16189408

 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14

这个函数的作用就是一个归一化处理。对应BN论文中的这个公式

$\hat{x}_i \leftarrow \frac{x_i-\mu_\beta}{\sqrt{\sigma_\beta^2+\epsilon}}$ //normalize（代码中 ε 加在开方之外）

接着看scale_bias和add_bias函数

/*
 * Multiply every element of channel i by scales[i], in place.
 *
 * output: data laid out as [batch][n][size].
 * scales: one factor per channel (n values).
 */
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    /* The b/ch/s loops walk `output` in storage order, so a running
       position replaces the explicit (b*n + ch)*size + s index. */
    int pos = 0;
    int b, ch, s;
    for (b = 0; b < batch; ++b) {
        for (ch = 0; ch < n; ++ch) {
            for (s = 0; s < size; ++s) {
                output[pos] *= scales[ch];
                ++pos;
            }
        }
    }
}
/*
 * Add biases[i] to every element of channel i, in place.
 *
 * output: data laid out as [batch][n][size].
 * biases: one offset per channel (n values).
 */
void add_bias(float *output, float *biases, int batch, int n, int size)
{
    int b, ch, s;
    for (b = 0; b < batch; ++b) {
        for (ch = 0; ch < n; ++ch) {
            /* First element of channel `ch` in sample `b`. */
            const int base = (b*n + ch)*size;
            for (s = 0; s < size; ++s) {
                output[base + s] += biases[ch];
            }
        }
    }
}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

这两个函数的意义都很简单。对应BN论文中的这个公式

$y_i \leftarrow \gamma\hat{x}_i+\beta$ //scale and shift

接着我们回到forward_convolutional_layer函数

//forward_convolutional_layer
    if(l.batch_normalize){
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }

    activate_array(l.output, l.outputs*l.batch, l.activation);
    if(l.binary || l.xnor) swap_binary(&l);
}
 
 1
2
3
4
5
6
7
8
9
10

如果没有设置batch_normalize，直接添加偏置就完事了。接着是activate_array函数

/*
 * Apply activation `a` to each of the first n elements of x, in place,
 * by passing every element through activate().
 */
void activate_array(float *x, const int n, const ACTIVATION a)
{
    int idx = 0;
    while (idx < n) {
        x[idx] = activate(x[idx], a);
        ++idx;
    }
}
/*
 * Apply a single activation function to x.
 *
 * Dispatches on `a` to the matching *_activate helper and returns its
 * result. Returns 0 when `a` matches none of the cases listed below.
 */
float activate(float x, ACTIVATION a)
{
    switch(a){
        case LINEAR:
            return linear_activate(x);
        case LOGISTIC:
            return logistic_activate(x);
        case LOGGY:
            return loggy_activate(x);
        case RELU:
            return relu_activate(x);
        case ELU:
            return elu_activate(x);
        case RELIE:
            return relie_activate(x);
        case RAMP:
            return ramp_activate(x);
        case LEAKY:
            return leaky_activate(x);
        case TANH:
            return tanh_activate(x);
        case PLSE:
            return plse_activate(x);
        case STAIR:
            return stair_activate(x);
        case HARDTAN:
            return hardtan_activate(x);
        case LHTAN:
            return lhtan_activate(x);
    }
    /* Fallback for a value of `a` not handled above. */
    return 0;
}
/* Identity activation: returns its argument untouched. */
static inline float linear_activate(float x)
{
    return x;
}
static inline float logistic_activate(float x){return 1./(1. + exp(-x));}
static inline float loggy_activate(float x){return 2./(1. + exp(-x)) - 1;}
/* Rectified linear unit: x multiplied by the 0/1 result of (x > 0). */
static inline float relu_activate(float x)
{
    const int is_positive = (x > 0);
    return x*is_positive;
}
static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
/* Leaky rectifier with slope .01 for non-positive inputs. */
static inline float relie_activate(float x)
{
    if (x > 0) {
        return x;
    }
    return .01*x;
}
/* Ramp: the rectified value x*(x > 0) plus a .1*x linear term. */
static inline float ramp_activate(float x)
{
    const float rectified = x*(x>0);
    return rectified + .1*x;
}
/* Leaky rectifier with slope .1 for non-positive inputs. */
static inline float leaky_activate(float x)
{
    if (x > 0) {
        return x;
    }
    return .1*x;
}
static inline float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48

这个函数的意义也很明显，就是将layer的输出图像，输入到我们设置的激活函数中。至此forward_convolutional_layer中的问题全部解决。

好的，这篇文章的篇幅有些长了，我们把剩余部分放到下一篇

觉得不错，点个赞吧b(￣▽￣)d

由于本人水平有限，文中有不对之处，希望大家指出，谢谢^_^!

下一篇开始分析backward_convolutional_layer，敬请关注。

一个新新的小白

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
YOLOv2源码分析（二）

版权声明：本文为博主原创文章，未经博主允许不得转载。有事联系：coordinate@live.com https://blog.csdn.net/qq_17550379/article/details/78850099 文章全部YOLOv2源码分析接着上一讲没有讲...
复制链接

扫一扫