原文地址:https://blog.csdn.net/u014540717/article/details/53232426
一、主函数void forward_network(network net, network_state state)
//network.c
/*
 * Runs the forward pass of every layer in the network, in order.
 *
 * net   - network to evaluate; net.workspace is handed to the layers through
 *         state.workspace.
 * state - per-pass state, received by value: the index/input/workspace fields
 *         written here only change this function's local copy.
 */
void forward_network(network net, network_state state)
{
    int idx = 0;

    state.workspace = net.workspace;
    while (idx < net.n) {
        layer cur = net.layers[idx];

        state.index = idx;
        /* If the layer owns a delta buffer, scal_cpu is called on it with a
           factor of 0 (presumably resetting it to zero before this pass). */
        if (cur.delta) {
            scal_cpu(cur.outputs * cur.batch, 0, cur.delta, 1);
        }
        /* Dispatch to the layer's own forward function. The walkthrough
           covers these layer types, in this order:
             [convolutional] [maxpool] [local] [dropout] [connected] [detection] */
        cur.forward(cur, state);
        /* The output of this layer is the input of the next one. */
        state.input = cur.output;
        ++idx;
    }
}
1、前向传播-convolutional层
//convolutional_layer.c
/*
 * Forward pass of a convolutional layer on the CPU.
 *
 * For each sample in the batch the input image is unrolled with im2col_cpu()
 * into state.workspace and multiplied by the weight matrix with gemm(); the
 * result is accumulated into l.output. Optional batch normalisation, the bias
 * and the activation are applied afterwards.
 *
 * l     - the layer, received by value (swap_binary(&l) below therefore only
 *         touches this local copy of the struct).
 * state - per-pass state; state.input is advanced sample by sample here, which
 *         only affects the local copy.
 */
void forward_convolutional_layer(convolutional_layer l, network_state state)
{
//Height and width of this layer's output feature map
int out_h = convolutional_out_height(l);
int out_w = convolutional_out_width(l);
int i;
//Initialise: set every output value to 0
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
/*
if(l.binary){
binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
binarize_weights2(l.weights, l.n, l.c*l.size*l.size, l.cweights, l.scales);
swap_binary(&l);
}
*/
/*
if(l.binary){
int m = l.n;
int k = l.size*l.size*l.c;
int n = out_h*out_w;
char *a = l.cweights;
float *b = state.workspace;
float *c = l.output;
for(i = 0; i < l.batch; ++i){
im2col_cpu(state.input, l.c, l.h, l.w,
l.size, l.stride, l.pad, b);
gemm_bin(m,n,k,1,a,k,b,n,c,n);
c += n*m;
state.input += l.c*l.h*l.w;
}
scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
activate_array(l.output, m*n*l.batch, l.activation);
return;
}
*/
//xnor mode: binarize the weights and the input, then run the rest of the
//function on the binarized input buffer
if(l.xnor){
binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
swap_binary(&l);
binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
state.input = l.binary_input;
}
//m = number of filters, k = number of weights per filter (l.size is the
//kernel size), n = number of pixels in one output feature map
int m = l.n;
int k = l.size*l.size*l.c;
int n = out_h*out_w;
if (l.xnor && l.c%32 == 0 && AI2) {
forward_xnor_layer(l, state);
printf("xnor\n");
} else {
//l.weights holds the filter parameters; per the original author,
//`grep -rn "l.weights"` shows:
//l.weights = calloc(c*n*size*size, sizeof(float))
//a points at the weights, b at the workspace, c at the output
float *a = l.weights;
float *b = state.workspace;
float *c = l.output;
for(i = 0; i < l.batch; ++i){
//im2col = "image to column": unroll the image, kernel window by kernel
//window, into a matrix so the convolution becomes a matrix product
im2col_cpu(state.input, l.c, l.h, l.w,
l.size, l.stride, l.pad, b);
//Matrix product, i.e. the convolution itself: (m x k) * (k x n) added into c
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
c += n*m;
//Advance to the next sample of the batch
state.input += l.c*l.h*l.w;
}
}
//Batch normalisation (speeds up convergence)
if(l.batch_normalize){
forward_batchnorm_layer(l, state);
}
//Add the bias term
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
//Non-linearity: apply l.activation to every output value
//(the original author notes it is leaky ReLU in their configuration)
activate_array(l.output, m*n*l.batch, l.activation);
//NOTE(review): the original author was unsure what binary/xnor mean; they
//presumably select binarized-weight / binarized-input convolution (XNOR-Net
//style) - confirm. This call swaps the weight buffers on the local copy again.
if(l.binary || l.xnor) swap_binary(&l);
}
函数剖析
a. im2col_cpu()
这个函数还是很重要的,我们来分析下。这个函数是从caffe中移植过来的
//im2col.c
//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
/*
 * Unrolls an image into the "column" matrix used to express convolution as a
 * matrix product.
 *
 * data_im  - input image, indexed as [channel][row][col].
 * channels, height, width - dimensions of data_im.
 * ksize    - side length of the square kernel.
 * stride   - step between neighbouring kernel positions.
 * pad      - amount of zero padding on each border.
 * data_col - output, (channels*ksize*ksize) rows by (out_h*out_w) columns,
 *            row-major; row r holds, for every kernel position, the pixel
 *            under kernel element r.
 */
void im2col_cpu(float* data_im,
        int channels, int height, int width,
        int ksize, int stride, int pad, float* data_col)
{
    const int out_h = (height + 2*pad - ksize) / stride + 1;
    const int out_w = (width + 2*pad - ksize) / stride + 1;
    const int rows = channels * ksize * ksize;
    float *dst = data_col;   /* the output is filled strictly sequentially */

    /* One output row per (channel, kernel-row, kernel-col) triple. */
    for (int r = 0; r < rows; ++r) {
        const int kx = r % ksize;
        const int ky = (r / ksize) % ksize;
        const int src_channel = r / ksize / ksize;

        /* Slide the kernel over every output position. */
        for (int oy = 0; oy < out_h; ++oy) {
            const int src_row = ky + oy * stride;
            for (int ox = 0; ox < out_w; ++ox) {
                const int src_col = kx + ox * stride;
                *dst++ = im2col_get_pixel(data_im, height, width, channels,
                                          src_row, src_col, src_channel, pad);
            }
        }
    }
}
/*
 * Reads one pixel of a zero-padded image.
 *
 * im       - image data, indexed as [channel][row][col].
 * height, width - image dimensions; channels is accepted but not used.
 * row, col - coordinates in the padded image.
 * channel  - channel to read from.
 * pad      - padding width; it is subtracted from row/col to get the
 *            coordinates in the unpadded image.
 *
 * Returns the pixel value, or 0 when the coordinates fall in the padding.
 */
float im2col_get_pixel(float *im, int height, int width, int channels,
                       int row, int col, int channel, int pad)
{
    const int y = row - pad;
    const int x = col - pad;
    const int inside = (y >= 0 && y < height && x >= 0 && x < width);

    (void)channels;
    if (!inside) {
        return 0;
    }
    return im[x + width * (y + height * channel)];
}
我画了下面的图来帮助理解下im2col_cpu()这个函数,为了方便理解,这里假设图像尺寸是5*5, stride=2,kernel_size=3
float *b指向state.workspace这个工作空间,也就是把原始数据变成行向量放到工作空间里,然后进行卷积计算
b. gemm()
这个函数实现了卷积的运算
//gemm.c
/*
 * General matrix multiply entry point. Forwards every argument unchanged to
 * gemm_cpu().
 *
 * TA, TB   - non-zero selects the transposed kernel variant for A / B.
 * M, N, K  - C is M x N; K is the shared inner dimension.
 * ALPHA    - scale applied to the product.
 * A, B, C  - matrices with leading dimensions lda, ldb, ldc.
 * BETA     - scale applied to C before the product is accumulated into it.
 */
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc)
{
gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
}
/*
 * CPU matrix multiply: scales C by BETA, then hands the accumulation of the
 * ALPHA-scaled product to the kernel that matches the transpose flags.
 *
 * TA, TB   - non-zero selects the transposed variant for A / B.
 * M, N, K  - C is M x N; K is the shared inner dimension.
 * A, B, C  - matrices with leading dimensions lda, ldb, ldc.
 */
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    /* Pre-scale the M x N output by BETA. */
    for (int row = 0; row < M; ++row) {
        const int c_off = row * ldc;
        for (int col = 0; col < N; ++col) {
            C[c_off + col] *= BETA;
        }
    }

    if (TA) {
        if (TB) {
            gemm_tt(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
        } else {
            gemm_tn(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
        }
    } else {
        if (TB) {
            gemm_nt(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
        } else {
            /* TA == 0 and TB == 0: the variant the convolutional layer uses. */
            gemm_nn(M, N, K, ALPHA, A, lda, B, ldb, C, ldc);
        }
    }
}
/*
 * Accumulates C += ALPHA * A * B with neither operand transposed.
 *
 * A is M x K (leading dimension lda), B is K x N (ldb), C is M x N (ldc),
 * all row-major. C is only added to, never cleared.
 */
void gemm_nn(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc)
{
    for (int row = 0; row < M; ++row) {
        const int c_off = row * ldc;
        for (int inner = 0; inner < K; ++inner) {
            /* The scaled A element is constant across the innermost loop, so
               it is computed once per (row, inner) pair. */
            const float a_scaled = ALPHA * A[row * lda + inner];
            const int b_off = inner * ldb;
            for (int col = 0; col < N; ++col) {
                C[c_off + col] += a_scaled * B[b_off + col];
            }
        }
    }
}
这个函数将卷积后的值累加到c所指向的内存中(最终生成number of kernel个2*2的feature map)
函数结束后,开始循环每一个batch,卷积计算结果依次向后放
c. forward_batchnorm_layer()
这个函数实现batch normalization,加速了训练的收敛过程,详细见这篇论文 Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
//batchnorm_layer.c
/*
 * Batch-normalises l.output in place and then applies the per-channel scale.
 *
 * l     - the layer, received by value. For a standalone BATCHNORM layer the
 *         input is first copied into l.output; for a CONNECTED layer every
 *         output is treated as its own 1x1 channel.
 * state - when state.train is set, the mean and variance are computed from
 *         the current batch; otherwise the rolling statistics are used.
 */
void forward_batchnorm_layer(layer l, network_state state)
{
    if (l.type == BATCHNORM) {
        copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    }
    if (l.type == CONNECTED) {
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }

    const int channels = l.out_c;
    const int spatial = l.out_h*l.out_w;
    float *mean = l.rolling_mean;
    float *variance = l.rolling_variance;

    if (state.train) {
        /* Training: use the statistics of the current batch. */
        mean_cpu(l.output, l.batch, channels, spatial, l.mean);
        variance_cpu(l.output, l.mean, l.batch, channels, spatial, l.variance);
        mean = l.mean;
        variance = l.variance;
    }
    normalize_cpu(l.output, mean, variance, l.batch, channels, spatial);
    scale_bias(l.output, l.scales, l.batch, channels, spatial);
}
//blas.c
/*
 * Computes the per-filter mean of x.
 *
 * x       - data indexed as [batch][filter][spatial].
 * batch, filters, spatial - dimensions of x.
 * mean    - output, one value per filter: the average over every batch entry
 *           and every spatial position of that filter.
 */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    const float inv_count = 1./(batch * spatial);

    for (int f = 0; f < filters; ++f) {
        float *acc = &mean[f];

        *acc = 0;
        /* The same filter is gathered across all samples of the batch. */
        for (int b = 0; b < batch; ++b) {
            const int base = b*filters*spatial + f*spatial;
            for (int s = 0; s < spatial; ++s) {
                *acc += x[base + s];
            }
        }
        *acc *= inv_count;
    }
}
/*
 * Computes the per-filter sample variance of x around a given mean.
 *
 * x        - data indexed as [batch][filter][spatial].
 * mean     - per-filter means (e.g. as produced by mean_cpu).
 * batch, filters, spatial - dimensions of x.
 * variance - output, one value per filter: the sum of squared deviations
 *            divided by (batch*spatial - 1).
 *
 * When a filter has at most one sample (batch*spatial <= 1) the divisor would
 * be zero or negative; the variance is reported as 0 in that case instead of
 * the NaN that 0 * (1/0) would produce.
 */
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
    const int count = batch * spatial;
    const float scale = (count > 1) ? 1./(count - 1) : 0;

    for (int f = 0; f < filters; ++f) {
        variance[f] = 0;
        for (int b = 0; b < batch; ++b) {
            const int base = b*filters*spatial + f*spatial;
            for (int s = 0; s < spatial; ++s) {
                /* Square in double precision with a plain multiplication;
                   no pow() call is needed for an exponent of 2. */
                const double diff = x[base + s] - mean[f];
                variance[f] += diff * diff;
            }
        }
        variance[f] *= scale;
    }
}
//归一化
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
{
int b, f, i;
for(b = 0; b < batch; ++b){
for(f = 0; f < filters; ++f){
for(i = 0; i < spatial; ++i){
int index = b*filters*spatial + f*spatial + i;
//公式中的ε=.000001f
x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
}
}
}
}
//convolutional_layer.c
/*
 * Multiplies every value of channel i by scales[i], in place.
 *
 * output - data indexed as [batch][channel][position]; overwritten.
 * scales - one factor per channel (the callers in this walkthrough pass
 *          l.scales).
 * batch, n, size - number of samples, channels, and values per channel.
 */
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    float *cursor = output;   /* the buffer is visited strictly in order */

    for (int b = 0; b < batch; ++b) {
        for (int ch = 0; ch < n; ++ch) {
            for (int j = 0; j < size; ++j) {
                *cursor *= scales[ch];
                ++cursor;
            }
        }
    }
}
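这几个函数(mean_cpu、variance_cpu、normalize_cpu、scale_bias)分别对应论文中的如下公式:
$$\mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i,\qquad \sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m}(x_i-\mu_B)^2,\qquad \hat{x}_i = \frac{x_i-\mu_B}{\sqrt{\sigma_B^2+\epsilon}},\qquad y_i = \gamma\hat{x}_i+\beta$$
(注意代码与论文略有出入:variance_cpu除以的是m-1而不是m,normalize_cpu的分母是sqrt(variance)+ε;scale_bias只完成乘以γ的部分,β由随后的add_bias加上。)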
d. add_bias()
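注意:下面贴出的代码其实仍然是scale_bias()(逐通道乘以scales[i]),并不是add_bias()本身。add_bias()的三重循环结构与它完全相同,区别只在最内层:把`output[...] *= scales[i]`换成`output[...] += biases[i]`,也就是给每个输出通道的所有像素加上该通道的偏置。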
//convolutional_layer.c
/*
 * Multiplies every value of channel i by scales[i], in place.
 *
 * output - data indexed as [batch][channel][position]; overwritten.
 * scales - one factor per channel.
 * batch, n, size - number of samples, channels, and values per channel.
 *
 * NOTE(review): this listing sits under the "add_bias()" heading but is a
 * second copy of scale_bias(). add_bias() presumably has the same loop shape
 * with `+= biases[i]` in place of `*= scales[i]` - confirm against
 * convolutional_layer.c.
 */
void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    /* Each (sample, channel) pair is one contiguous run of `size` values. */
    const int planes = batch * n;

    for (int p = 0; p < planes; ++p) {
        const int ch = p % n;
        const int base = p * size;

        for (int j = 0; j < size; ++j) {
            output[base + j] *= scales[ch];
        }
    }
}
2、前向传播-maxpool layer
//池化层相对要简单很多,如果理解了卷积层,这一层就很好理解
3、前向传播-local layer
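顾名思义,local层是局部连接层(locally connected layer):它和卷积层一样用一个小窗口去看前一层的局部区域,但不做权值共享——每个输出位置都有自己独立的一组权重(见下面代码中`l.weights + j*l.size*l.size*l.c*l.n`)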
/*
 * Forward pass of a locally connected layer on the CPU.
 *
 * Works like a convolution, except that every output location has its own
 * slice of l.weights instead of sharing one set of filters.
 *
 * l     - the layer (weights, biases, geometry, output buffers).
 * state - per-pass state; state.input holds l.batch samples of l.w*l.h*l.c
 *         values each.
 */
void forward_local_layer(const local_layer l, network_state state)
{
    const int out_h = local_out_height(l);
    const int out_w = local_out_width(l);
    const int locations = out_h * out_w;
    const int m = l.n;                     /* filters per location */
    const int k = l.size*l.size*l.c;       /* values in one input patch */

    /* Start every sample's output from the biases. */
    for (int s = 0; s < l.batch; ++s) {
        copy_cpu(l.outputs, l.biases, 1, l.output + s*l.outputs, 1);
    }

    for (int s = 0; s < l.batch; ++s) {
        float *sample_in = state.input + s*l.w*l.h*l.c;
        float *sample_out = l.output + s*l.outputs;

        im2col_cpu(sample_in, l.c, l.h, l.w,
                   l.size, l.stride, l.pad, l.col_image);

        for (int loc = 0; loc < locations; ++loc) {
            float *loc_weights = l.weights + loc*l.size*l.size*l.c*l.n;
            float *patch = l.col_image + loc;
            float *dst = sample_out + loc;

            /* N == 1: one output location is processed per call, because its
               weights differ from every other location's. The leading
               dimension `locations` steps down column `loc` of the im2col
               matrix and of the output. */
            gemm(0,0,m,1,k,1,loc_weights,k,patch,locations,1,dst,locations);
        }
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
4、前向传播-dropout layer
dropout层主要是为了防止过拟合,详细可以自己搜索,这里不做过多介绍
5、前向传播-connected layer
/*
 * Forward pass of a fully connected layer on the CPU.
 *
 * Computes output = input * weights^T for the whole batch, optionally batch
 * normalises the result, then adds the biases and applies the activation.
 *
 * l     - the layer (weights, biases, batch-norm buffers, output).
 * state - per-pass state; state.input is the (batch x inputs) input matrix and
 *         state.train selects batch statistics vs. rolling statistics.
 */
void forward_connected_layer(connected_layer l, network_state state)
{
int i;
//Clear the output, since gemm accumulates into it
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
int m = l.batch;
int k = l.inputs;
int n = l.outputs;
float *a = state.input;
float *b = l.weights;
float *c = l.output;
//Note TA=0 and TB=1 here, so gemm_cpu() dispatches to gemm_nt()
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
if(l.batch_normalize){
if(state.train){
mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);
//Moving average of the mean: rolling_mean = .95*rolling_mean + .05*mean
//(assumes scal_cpu scales in place and axpy_cpu computes y += a*x - confirm)
//Step 1: multiply l.rolling_mean by .95
scal_cpu(l.outputs, .95, l.rolling_mean, 1);
//Step 2: add .05*l.mean to l.rolling_mean
axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
//Same update for the variance, step 1: multiply l.rolling_variance by .95
scal_cpu(l.outputs, .95, l.rolling_variance, 1);
//Step 2: add .05*l.variance to l.rolling_variance
axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);
//Copy l.output into l.x; at this point the values are not yet normalised
copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
//Batch normalisation with the statistics of the current batch
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
//Copy l.output into l.x_norm; these are the values after normalisation
copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
} else {
//Inference: normalise with the rolling statistics instead
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
}
scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
}
//Add the bias vector to every sample of the batch
for(i = 0; i < l.batch; ++i){
axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
}
//Apply l.activation (the original author notes that a linear activation
//leaves the values unchanged)
activate_array(l.output, l.outputs*l.batch, l.activation);
}
//gemm.c
/*
 * Accumulates C += ALPHA * A * B^T.
 *
 * A is M x K (leading dimension lda), B is stored as N x K (ldb), and C is
 * M x N (ldc), all row-major. The connected layer calls this with M = batch,
 * N = outputs and K = inputs, so each C element is the dot product of one
 * input sample with one output's weight row.
 */
void gemm_nt(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc)
{
    for (int row = 0; row < M; ++row) {
        const int a_off = row * lda;
        for (int col = 0; col < N; ++col) {
            const int b_off = col * ldb;
            float dot = 0;

            /* Multiply matching input and weight entries and sum them up. */
            for (int t = 0; t < K; ++t) {
                dot += ALPHA * A[a_off + t] * B[b_off + t];
            }
            C[row * ldc + col] += dot;
        }
    }
}
6、前向传播-detection layer
这是最后一层了,这也是作者论文的精髓所在,希望大家能对比论文认真看一下。
前向传播终于要结束了,这篇博文有点小长~~
/*
 * Forward pass of the detection layer.
 *
 * Copies the input into l.output (optionally applying softmax to the class
 * scores of every grid cell). When training it also fills l.delta with the
 * scaled (truth - prediction) terms for confidence, class and box
 * coordinates, stores the resulting cost in *(l.cost), and prints averaged
 * statistics.
 *
 * Per-sample layout of l.output / l.delta, as implied by the index
 * arithmetic below (locations = l.side*l.side):
 *   [0, locations*l.classes)                    class scores per cell
 *   [.., + locations*l.n)                       confidence per box
 *   [.., + locations*l.n*l.coords)              box coordinates
 * Per-cell layout of state.truth: 1 object flag, l.classes class values,
 * l.coords box values.
 *
 * NOTE(review): if no cell contains an object, count stays 0 and the averages
 * printed at the end divide by zero. best_index also stays -1 if neither
 * selection branch ever fires (e.g. NaN values) - confirm whether that needs
 * a guard.
 */
void forward_detection_layer(const detection_layer l, network_state state)
{
int locations = l.side*l.side;
int i,j;
memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
//if(l.reorg) reorg(l.output, l.w*l.h, size*l.n, l.batch, 1);
int b;
//NOTE(review): the original author observes that softmax is 0 in their
//configuration, so this branch is skipped there - confirm for the cfg in use
if (l.softmax){
for(b = 0; b < l.batch; ++b){
int index = b*l.inputs;
for (i = 0; i < locations; ++i) {
int offset = i*l.classes;
softmax(l.output + index + offset, l.classes, 1,
l.output + index + offset);
}
}
}
//The cost function is only needed while training
if(state.train){
float avg_iou = 0;
float avg_cat = 0;
float avg_allcat = 0;
float avg_obj = 0;
float avg_anyobj = 0;
int count = 0;
*(l.cost) = 0;
int size = l.inputs * l.batch;
/*void *memset(void *s, int ch, size_t n);
memset is a C/C++ library function: it sets the first n bytes of the memory
that s points to to the value ch*/
memset(l.delta, 0, size * sizeof(float));//l.delta holds the scaled (truth - prediction) term of every output
for (b = 0; b < l.batch; ++b){
int index = b*l.inputs;
//locations = l.side*l.side (7*7 in the original author's configuration)
for (i = 0; i < locations; ++i) {
//coords covers x, y, w, h; the leading 1 is the object flag (confidence)
//truth_index is the offset of this cell's ground truth in state.truth
int truth_index = (b*locations + i)*(1+l.coords+l.classes);
int is_obj = state.truth[truth_index];
//Confidence loss
for (j = 0; j < l.n; ++j) {
//p_index is the offset of the predicted confidence; every cell predicts l.n
//boxes (the original author notes l.n=3 from the cfg "num" value, while the
//paper uses 2)
int p_index = index + locations*l.classes + i*l.n + j;
l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
//Matches the paper's formula; for now assume none of the boxes holds an object
*(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
avg_anyobj += l.output[p_index];
}
int best_index = -1;
float best_iou = 0;
float best_rmse = 20;
//Cells without an object only contribute the no-object confidence term
if (!is_obj){
continue;
}
int class_index = index + i*l.classes;
for(j = 0; j < l.classes; ++j) {
//Class loss
l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
*(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
avg_allcat += l.output[class_index+j];
}
//Localisation (box coordinate) loss
box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
truth.x /= l.side;
truth.y /= l.side;
/*Find the box that will be responsible for the prediction (We only predict one set of class probabilities per grid cell,
regardless of the number of boxes B)*/
for(j = 0; j < l.n; ++j){
int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
box out = float_to_box(l.output + box_index);
out.x /= l.side;
out.y /= l.side;
//With l.sqrt set, the network outputs hold square roots of width and height
if (l.sqrt){
out.w = out.w*out.w;
out.h = out.h*out.h;
}
//Intersection over union with the ground-truth box
float iou = box_iou(out, truth);
//iou = 0;
//Root-mean-square error against the ground-truth box
float rmse = box_rmse(out, truth);
//Pick the box with the largest IOU; while no box has overlapped at all,
//fall back to the box with the smallest RMSE
if(best_iou > 0 || iou > 0){
if(iou > best_iou){
best_iou = iou;
best_index = j;
}
}else{
if(rmse < best_rmse){
best_rmse = rmse;
best_index = j;
}
}
}
//Forced assignment of the responsible box, by ground-truth box area
if(l.forced){
if(truth.w*truth.h < .1){
best_index = 1;
}else{
best_index = 0;
}
}
//Random assignment of the responsible box early in training
if(l.random && *(state.net.seen) < 64000){
best_index = rand()%l.n;
}
//Offset of the chosen predicted box
int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
//Offset of the ground-truth box
int tbox_index = truth_index + 1 + l.classes;
box out = float_to_box(l.output + box_index);
out.x /= l.side;
out.y /= l.side;
if (l.sqrt) {
out.w = out.w*out.w;
out.h = out.h*out.h;
}
float iou = box_iou(out, truth);
//printf("%d,", best_index);
int p_index = index + locations*l.classes + i*l.n + best_index;
//The loop above charged every box as "no object"; undo that for the
//responsible box and charge the object term instead
*(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
*(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
avg_obj += l.output[p_index];
l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);
//With l.rescore set, the confidence target is the IOU rather than 1
if(l.rescore){
l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
}
l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
//With l.sqrt set, width and height are compared in square-root space
if(l.sqrt){
l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
}
//Uses the IOU (which depends on x, y, w, h) as a loss term; this value is
//overwritten below, where the cost is recomputed from l.delta
*(l.cost) += pow(1-iou, 2);
avg_iou += iou;
++count;
}
}
//Not used in the paper; disabled (if(0)). It would zero every confidence
//delta whose square is below the 100th largest
if(0){
float *costs = calloc(l.batch*locations*l.n, sizeof(float));
for (b = 0; b < l.batch; ++b) {
int index = b*l.inputs;
for (i = 0; i < locations; ++i) {
for (j = 0; j < l.n; ++j) {
int p_index = index + locations*l.classes + i*l.n + j;
costs[b*locations*l.n + i*l.n + j] = l.delta[p_index]*l.delta[p_index];
}
}
}
int indexes[100];
top_k(costs, l.batch*locations*l.n, 100, indexes);
float cutoff = costs[indexes[99]];
for (b = 0; b < l.batch; ++b) {
int index = b*l.inputs;
for (i = 0; i < locations; ++i) {
for (j = 0; j < l.n; ++j) {
int p_index = index + locations*l.classes + i*l.n + j;
if (l.delta[p_index]*l.delta[p_index] < cutoff) l.delta[p_index] = 0;
}
}
}
free(costs);
}
//The earlier *(l.cost) accumulations are overwritten here: the final cost is
//the squared magnitude of l.delta
*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
//Print the training log line
printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
//if(l.reorg) reorg(l.delta, l.w*l.h, size*l.n, l.batch, 0);
}
}
二、总结
读到这里,我们已经完全掌握了YOLO代码的框架,我们大概总结下darknet的优缺点。
优点:
- 代码依赖项少,只有cuda,甚至连opencv都可以不需要,如果你在cpu平台,cuda都可以扔了(当然darknet的cpu代码并没有做什么优化,跑起来就很慢)。这对于做工程的人来说是非常好的消息,因为我们可以很easy的将代码移植到其他平台
缺点:
-
在darknet中,所有层的lr都一样,这对微调造成了很大的困难,因为微调需要把前面几层的lr都设置的很小很小,然后主要训练最后一层的权重
-
总的来说就是darknet的接口确实很差,如果想把网络改成inception或者resnet的构架,需要改大量的代码,这对于验证模型可行性来说,非常浪费时间。你也应该能理解到为什么我们想要将代码移植到mxnet或者caffe上,然后在mxnet上做模型压缩了
下一篇是反向传播部分的代码,这部分是很重要的,但对于移植或者进一步改进网络来说,其实没必要理解反向传播部分的代码,但如果你想对CNN有更深入的了解,可以继续看下一篇关于反向传播部分的内容。