深度学习知识十四 yolo v2 损失函数源码（训练核心代码）解读和其实现原理、网络的输出格式

最新推荐文章于 2023-10-25 21:58:46 发布

yangdeshun888

最新推荐文章于 2023-10-25 21:58:46 发布

阅读量7k

点赞数

分类专栏：深度学习

本文链接：https://blog.csdn.net/yangdashi888/article/details/78026320

版权

深度学习专栏收录该内容

86 篇文章 3 订阅

订阅专栏

前提说明：

1, 关于 yolo 和 yolo v2 的详细解释请移步至如下两个链接，或者直接看论文（我自己有想写 yolo 的教程，但思前想后下面两个链接中的文章质量实在是太好了_(:з」∠)_）

yolo: https://zhuanlan.zhihu.com/p/24916786?refer=xiaoleimlnote

yolo v2: https://zhuanlan.zhihu.com/p/25167153 （解析了yolov2的网络输出格式）

2, 本文仅解读 yolo v2 的 loss 函数的源码，该代码请使用如下命令

git clone https://github.com/pjreddie/darknet

后打开 src/region_layer.c 查看

3, yolo 的官方网站地址为：https://pjreddie.com/darknet/yolo/

4, 我调试代码时使用的命令是：

./darknet detector train cfg/voc.data cfg/yolo-voc.cfg darknet19_448.conv.23

5.最新版yolo v2的损失函数的源码解读(解释无GPU版本)，如下：

/*
 * Forward pass of the region layer (non-GPU path), including the training loss.
 *
 * Steps performed, in order:
 *   1. Copy state.input into l.output and activate it: logistic on the value at
 *      offset 4 of every box record, softmax (or softmax_tree) on the class
 *      scores starting at offset 5.
 *   2. If state.train is not set, stop here.
 *   3. Otherwise fill l.delta: for every predicted box a "no object" confidence
 *      delta, and for every ground-truth box the coordinate, confidence and
 *      class deltas of the anchor chosen for it.
 *   4. Store the cost in *(l.cost) and print running statistics.
 *
 * l     - region layer; reads its dimensions, scales and biases, writes
 *         l.output, l.delta and *(l.cost).
 * state - network state; reads state.input, state.truth, state.train and
 *         *(state.net.seen).
 */
void forward_region_layer(const region_layer l, network_state state)
{
    int i,j,b,t,n;
    // Number of floats per predicted box record. The code below reads the box
    // at offsets 0..3, the confidence at offset 4 and class scores from offset 5.
    int size = l.coords + l.classes + 1;
    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
    #ifndef GPU
    // NOTE(review): flatten() is defined elsewhere; presumably it rearranges the
    // buffer so that each box record is contiguous - confirm against its source.
    flatten(l.output, l.w*l.h, size*l.n, l.batch, 1);
    #endif
    // Logistic activation on the confidence value of every box record.
    for (b = 0; b < l.batch; ++b){
        for(i = 0; i < l.h*l.w*l.n; ++i){
            int index = size*i + b*l.outputs;
            l.output[index + 4] = logistic_activate(l.output[index + 4]);
        }
    }
#ifndef GPU
    // Normalise the class scores of every box record in place.
    if (l.softmax_tree){
        for (b = 0; b < l.batch; ++b){
            for(i = 0; i < l.h*l.w*l.n; ++i){
                int index = size*i + b*l.outputs;
                softmax_tree(l.output + index + 5, 1, 0, 1, l.softmax_tree, l.output + index + 5);
            }
        }
    } else if (l.softmax){
        for (b = 0; b < l.batch; ++b){
            for(i = 0; i < l.h*l.w*l.n; ++i){
                int index = size*i + b*l.outputs;
                softmax(l.output + index + 5, l.classes, 1, l.output + index + 5);
            }
        }
    }
#endif
    // Inference only needs the activated output; everything below is the loss.
    if(!state.train) return;
    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
    // Accumulators that feed only the statistics line printed at the end.
    float avg_iou = 0;
    float recall = 0;
    float avg_cat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;
    int class_count = 0;
    *(l.cost) = 0;
    // Compute the deltas image by image over the whole batch.
    for (b = 0; b < l.batch; ++b) {
        // Only taken when l.softmax_tree is set (the article's author notes their
        // configuration does not use it).
        if(l.softmax_tree){
            int onlyclass = 0;
            for(t = 0; t < 30; ++t){
                box truth = float_to_box(state.truth + t*5 + b*l.truths);
                if(!truth.x) break;
                int class = state.truth[t*5 + b*l.truths + 4];
                float maxp = 0;
                int maxi = 0;
                // NOTE(review): x and y above 100000 look like a sentinel for a
                // label that carries a class but no box - confirm with the data loader.
                if(truth.x > 100000 && truth.y > 100000){
                    // Pick the box record whose confidence * class probability is highest.
                    for(n = 0; n < l.n*l.w*l.h; ++n){
                        int index = size*n + b*l.outputs + 5;
                        float scale =  l.output[index-1];
                        float p = scale*get_hierarchy_probability(l.output + index, l.softmax_tree, class);
                        if(p > maxp){
                            maxp = p;
                            maxi = n;
                        }
                    }
                    int index = size*maxi + b*l.outputs + 5;
                    delta_region_class(l.output, l.delta, index, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
                    ++class_count;
                    onlyclass = 1;
                    break;
                }
            }
            // Such an image contributes a class delta only; skip the box losses.
            if(onlyclass) continue;
        }
        /*
         * l.h and l.w are the loop bounds over grid rows and columns, and l.n is
         * the number of box records per grid cell. According to the article, l.h
         * and l.w are the resolution of the final feature map and l.n equals the
         * "num" setting that accompanies "anchors" in the cfg file; unlike
         * YOLOv1's fixed 7x7 grid, every feature-map position is a cell here.
         */
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                // One prediction per anchor for this cell; l.biases holds a
                // width/height pair per anchor.
                for (n = 0; n < l.n; ++n) {
                    int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
                    box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
                    float best_iou = 0;
                    int best_class = -1;
                    // Scan at most 30 ground-truth entries (5 floats each) for this image.
                    for(t = 0; t <30; ++t){
						// Read the ground-truth box x, y, w, h.
                        box truth = float_to_box(state.truth + t*5 + b*l.truths);
						// A zero x ends the list of ground-truth boxes.
						if (!truth.x)
							break;
                        float iou = box_iou(pred, truth);
                        // Remember the ground truth that overlaps this prediction the most.
                        if (iou > best_iou) {
                            best_class = state.truth[t*5 + b*l.truths + 4];
                            best_iou = iou;
                        }	
                    }
                    // Confidence delta: by default push the confidence toward 0 ("no object").
                    avg_anyobj += l.output[index + 4];
                    l.delta[index + 4] = l.noobject_scale * ((0 - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
                    // classfix == -1: push the confidence toward the best IOU instead of 0.
                    if(l.classfix == -1) l.delta[index + 4] = l.noobject_scale * ((best_iou - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
                    else{
                        // A prediction that already overlaps some truth above l.thresh
                        // is not penalised as "no object".
                        if (best_iou > l.thresh) {
                            l.delta[index + 4] = 0;
                            if(l.classfix > 0){
                                delta_region_class(l.output, l.delta, index + 5, best_class, l.classes, l.softmax_tree, l.class_scale*(l.classfix == 2 ? l.output[index + 4] : 1), &avg_cat);
                                ++class_count;
                            }
                        }
                    }
                    // Taken only while fewer than 12800 images have been seen: pull
                    // every prediction toward its anchor placed at the cell centre,
                    // with a small scale of .01.
                    if(*(state.net.seen) < 12800){
                        box truth = {0};
                        truth.x = (i + .5)/l.w;
                        truth.y = (j + .5)/l.h;
                        truth.w = l.biases[2*n];
                        truth.h = l.biases[2*n+1];
                        if(DOABS){
                            truth.w = l.biases[2*n]/l.w;
                            truth.h = l.biases[2*n+1]/l.h;
                        }
                        // Store the difference between the predicted tx, ty, tw, th and
                        // the targets derived from this synthetic box into l.delta.
                        delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01);
                    }
                }
            }
        }
        // Every box record now carries a confidence delta. Next, handle each
        // ground-truth box: pick one anchor in its cell and overwrite that
        // record's deltas with the "object" targets.
        for(t = 0; t < 30; ++t){
            // Read the ground-truth box x, y, w, h.
            box truth = float_to_box(state.truth + t*5 + b*l.truths);
            if(!truth.x) break;
            float best_iou = 0;
            int best_index = 0;
            int best_n = 0;
            // Grid cell containing the truth centre (truncation toward zero).
            i = (truth.x * l.w);
            j = (truth.y * l.h);
            //printf("%d %f %d %f\n", i, truth.x*l.w, j, truth.y*l.h);
            // Shift the truth box to the origin so that the IOU below compares
            // width and height only.
            box truth_shift = truth;
            truth_shift.x = 0;
            truth_shift.y = 0;
            //printf("index %d %d\n",i, j);
            // Find the anchor in this cell that matches the truth shape best.
            for(n = 0; n < l.n; ++n){
                // Offset of this box record: size floats per record,
                // (j*l.w*l.n + i*l.n + n) selects cell and anchor, b*l.outputs selects the image.
                int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
                // Decode the predicted x, y, w, h for this record.
                box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
                // With bias_match the comparison uses the raw anchor size from
                // l.biases rather than the predicted size.
                if(l.bias_match){
                    pred.w = l.biases[2*n];
                    pred.h = l.biases[2*n+1];
                    if(DOABS){
                        pred.w = l.biases[2*n]/l.w;
                        pred.h = l.biases[2*n+1]/l.h;
                    }
                }
                //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
                // Shift the prediction to the origin as well before computing the IOU.
                pred.x = 0;
                pred.y = 0;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou){
                    best_index = index;
                    best_iou = iou;
                    best_n = n;
                }
            }
            //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);
            // Write the coordinate deltas for the chosen record; the return value is
            // the IOU between its decoded prediction and the truth box.
			float iou = delta_region_box(truth, l.output, l.biases, best_n, best_index, i, j, l.w, l.h, l.delta, l.coord_scale);
            // Count this truth as recalled when the IOU exceeds .5.
			if(iou > .5) recall += 1;
            avg_iou += iou;
            // Coordinate deltas are done; confidence and class deltas follow.
            //l.delta[best_index + 4] = iou - l.output[best_index + 4];
            avg_obj += l.output[best_index + 4];
            // Confidence delta for the chosen record: push the confidence toward 1,
            // scaled by l.object_scale and the logistic gradient.
            l.delta[best_index + 4] = l.object_scale * (1 - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
            if (l.rescore) {
                // Use the IOU as the target instead of 1 (the article's author reports
                // l.rescore = 1 in their debugging run, so this branch was taken).
                l.delta[best_index + 4] = l.object_scale * (iou - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
            }

            // Ground-truth class, optionally remapped through l.map.
            int class = state.truth[t*5 + b*l.truths + 4];
            if (l.map) class = l.map[class];
            // Write the class deltas for the chosen record.
            // NOTE(review): delta_region_class is defined elsewhere; the article
            // describes it as scale * (0/1 target - predicted probability) per class.
            delta_region_class(l.output, l.delta, best_index + 5, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
            ++count;
            ++class_count;
        }
    }
    //printf("\n");
    #ifndef GPU
    // Same helper as at the top, called with 0 as the last argument instead of 1.
    flatten(l.delta, l.w*l.h, size*l.n, l.batch, 0);
    #endif
    // l.delta now holds the coordinate, confidence and class deltas of every record.
    // NOTE(review): mag_array is defined elsewhere; the article describes it as the
    // square root of the sum of squares, which would make the cost the sum of squared deltas.
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    // NOTE(review): count and class_count are 0 when no ground truth was processed,
    // so the averages printed below divide by zero in that case.
    printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f,  count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
}

注：上面的代码解释是个人参考网上资料后的一些见解，其中如有不对的地方，大家可以指出来，通过修改完善造福更多人。

下面是yolov2的特征网络的输出信息结构：

其中xywh是检测到的目标的坐标信息，confidence是网络是否有目标的输出置信度，后面的classPr是网络输出每个类别的置信度。下面是网络最后一层的滤波核的计算公式：

filters=(classes + coords + 1)*(number of anchors)

6、yolov2 训练完成后，根据网络输出求解 box 坐标的函数：

进行最后region层获取结果box，其如下：

    /* Decode the last layer's output into boxes/probs; only region and
       detection layers are accepted here, anything else is reported via error(). */
    if (l.type == REGION) {
        get_region_boxes(l, 1, 1, demo_thresh, probs, boxes, 0, 0);
    } else if (l.type == DETECTION) {
        get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0);
    } else {
        error("Last layer must produce detections\n");
    }

    /* Non-maximum suppression over all l.w*l.h*l.n candidates, only when nms is positive. */
    if (nms > 0) {
        do_nms(boxes, probs, l.w*l.h*l.n, l.classes, nms);
    }

    /* ANSI escapes: clear the screen, move the cursor to row 1 / column 1,
       then print the FPS header. */
    printf("\033[2J");
    printf("\033[1;1H");
    printf("\nFPS:%.1f\n",fps);
    printf("Objects:\n\n");

yolov2是运行get_region_boxes，yolov1是运行get_detection_boxes。下面以yolov2的进行讲解：

/*
 * Turn the region layer's output into boxes and per-class probabilities.
 *
 * For every grid cell and every anchor, one record of (l.classes + 5) floats is
 * read from l.output: the box at offsets 0..3, the confidence at offset 4 and
 * the class scores from offset 5.
 *
 * l               - region layer; reads l.output, l.biases, l.w, l.h, l.n,
 *                   l.classes, l.classfix and l.softmax_tree.
 * w, h            - factors the decoded x/w and y/h are multiplied by.
 * thresh          - probabilities that do not exceed this value are stored as 0.
 * probs           - out: probs[slot][class].
 * boxes           - out: boxes[slot].
 * only_objectness - when non-zero, probs[slot][0] is overwritten with the confidence.
 * map             - optional index table, used only on the softmax_tree path.
 */
void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map)
{
	float *out = l.output;
	int cell, anchor, c;

	for (cell = 0; cell < l.w*l.h; ++cell) {
		int row = cell / l.w;
		int col = cell % l.w;

		for (anchor = 0; anchor < l.n; ++anchor) {
			int slot = cell*l.n + anchor;
			/* Start of this record; the confidence sits at +4, class scores at +5. */
			int base = slot * (l.classes + 5);
			int cls = base + 5;

			float conf = out[base + 4];
			if (l.classfix == -1 && conf < .5) conf = 0;

			/* Decode the box for this record, then scale it by w and h. */
			box decoded = get_region_box(out, l.biases, anchor, base, col, row, l.w, l.h);
			decoded.x *= w;
			decoded.y *= h;
			decoded.w *= w;
			decoded.h *= h;
			boxes[slot] = decoded;

			if (!l.softmax_tree) {
				/* Flat classes: probability = confidence * class score. */
				for (c = 0; c < l.classes; ++c) {
					float p = conf*out[cls + c];
					probs[slot][c] = (p > thresh) ? p : 0;
				}
			} else {
				hierarchy_predictions(out + cls, l.classes, l.softmax_tree, 0);
				if (map) {
					/* 200 mapped entries are read through the lookup table. */
					for (c = 0; c < 200; ++c) {
						float p = conf*out[cls + map[c]];
						probs[slot][c] = (p > thresh) ? p : 0;
					}
				} else {
					/* Walking from the last class down, keep only the first score
					   above .5 and zero every other score in place. */
					int found = 0;
					for (c = l.classes - 1; c >= 0; --c) {
						if (found || !(out[cls + c] > .5)) {
							out[cls + c] = 0;
						} else {
							found = 1;
						}
						float kept = out[cls + c];
						probs[slot][c] = (conf > thresh) ? kept : 0;
					}
				}
			}

			if (only_objectness) {
				probs[slot][0] = conf;
			}
		}
	}
}

其中get_region_box函数的实现如下：

/*
 * Decode one predicted box from the raw values tx, ty, tw, th stored at
 * x[index .. index + 3].
 *
 * x       - buffer holding the raw predictions.
 * biases  - anchor sizes; anchor n uses biases[2n] (width) and biases[2n + 1] (height).
 * n       - anchor index.
 * index   - offset of tx inside x.
 * i, j    - column and row of the grid cell.
 * w, h    - grid width and height.
 *
 * Returns the box with x, y divided by the grid size. Width and height are the
 * anchor size times exp(tw) / exp(th), additionally divided by the grid size
 * when DOABS is set.
 */
box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h)
{
	float *raw = x + index;
	float *anchor = biases + 2 * n;
	box b;

	/* The logistic activation is applied to tx and ty only; the article notes
	   this keeps the centre offset below one cell, so the box cannot drift
	   into a neighbouring cell. */
	b.x = (i + logistic_activate(raw[0])) / w;
	b.y = (j + logistic_activate(raw[1])) / h;

	/* tw and th are log-space scale factors relative to the anchor size. */
	if (DOABS) {
		b.w = exp(raw[2]) * anchor[0] / w;
		b.h = exp(raw[3]) * anchor[1] / h;
	} else {
		b.w = exp(raw[2]) * anchor[0];
		b.h = exp(raw[3]) * anchor[1];
	}
	return b;
}

下面是进行delta_region_box求tx、ty、tw th信息损失loss的计算函数：

/*
 * Write the coordinate deltas for one box record and report how well the
 * current prediction overlaps the target.
 *
 * truth   - target box.
 * x       - buffer holding the raw predictions tx, ty, tw, th at x[index .. index + 3].
 * biases  - anchor sizes; anchor n uses biases[2n] and biases[2n + 1].
 * n       - anchor index.
 * index   - offset of the record inside x and delta.
 * i, j    - column and row of the grid cell.
 * w, h    - grid width and height.
 * delta   - out: delta[index .. index + 3] receive the scaled differences.
 * scale   - multiplier applied to all four deltas.
 *
 * Returns the IOU between the decoded prediction and truth.
 */
float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale)
{
	box pred = get_region_box(x, biases, n, index, i, j, w, h);
	float overlap = box_iou(pred, truth);

	/* Targets are the inverse of the decoding in get_region_box: the centre
	   offset inside cell (i, j) and the log of the size relative to the anchor. */
	float target_x = (truth.x*w - i);
	float target_y = (truth.y*h - j);
	float target_w;
	float target_h;
	if (DOABS) {
		target_w = log(truth.w*w / biases[2 * n]);
		target_h = log(truth.h*h / biases[2 * n + 1]);
	} else {
		target_w = log(truth.w / biases[2 * n]);
		target_h = log(truth.h / biases[2 * n + 1]);
	}

	/* tx and ty pass through the logistic activation, so their deltas carry the
	   logistic gradient; tw and th are compared on the raw values. */
	float sx = logistic_activate(x[index + 0]);
	float sy = logistic_activate(x[index + 1]);
	delta[index + 0] = scale * (target_x - sx) * logistic_gradient(sx);
	delta[index + 1] = scale * (target_y - sy) * logistic_gradient(sy);
	delta[index + 2] = scale * (target_w - x[index + 2]);
	delta[index + 3] = scale * (target_h - x[index + 3]);
	return overlap;
}

下面是yolov1的损失函数计算公式（yolov2损失计算总体跟v1的一样，区别在于v1 限制tw th的方式是用根号）：

而yolov2主要改动的是其中第一项coord的loss，详细计算公式见delta_region_box。

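其中在训练完后，进行前向求实际目标的坐标信息 $b_x, b_y, b_w, b_h$，公式如下:

$$b_x = \sigma(t_x) + c_x,\qquad b_y = \sigma(t_y) + c_y,\qquad b_w = p_w e^{t_w},\qquad b_h = p_h e^{t_h}$$

$c_x, c_y$ 是当前网格左上角到图像左上角的距离，要先将网格大小归一化，即令一个网格的宽=1，高=1。$p_w, p_h$ 是先验框的宽和高。σ是sigmoid函数。 tx ty tw th是要学习的参数，用函数实现如上面的get_region_box函数。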

下面是yolov2读取anchors时候的代码，在parser.c里parse_region：

     /* Create the region layer, then fill l.biases from the comma-separated
        "anchors" option (one float per field), when that option is present. */
     layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords);
    char *a = option_find_str(options, "anchors", 0);
    if(a){
        /* Walk the string field by field. The loop stops as soon as no further
           comma exists, so no pointer is ever formed from a NULL strchr result
           (the previous code computed strchr(...) + 1 on the last field). */
        const char *cur = a;
        int i = 0;
        /* NOTE(review): writes to l.biases are not bounded here; the number of
           anchor values is assumed to fit the array allocated by
           make_region_layer - confirm against its allocation size. */
        while(cur){
            l.biases[i] = atof(cur);
            ++i;
            const char *comma = strchr(cur, ',');
            cur = comma ? comma + 1 : NULL;
        }
    }

yangdeshun888

关注

0
点赞
踩
10

收藏

觉得还不错? 一键收藏
4
评论
深度学习知识十四 yolo v2 损失函数源码（训练核心代码）解读和其实现原理、网络的输出格式

前提说明： 1, 关于 yolo 和 yolo v2 的详细解释请移步至如下两个链接，或者直接看论文（我自己有想写 yolo 的教程，但思前想后下面两个链接中的文章质量实在是太好了_(:з」∠)_） yolo:https://zhuanlan.zhihu.com/p/24916786?refer=xiaoleimlnote yolo v2:http...
复制链接

扫一扫

专栏目录