【darknet】|yolov3 推理

最新推荐文章于 2022-01-09 21:43:01 发布

rrr2

最新推荐文章于 2022-01-09 21:43:01 发布

阅读量519

点赞数

分类专栏： DARKNET

本文链接：https://blog.csdn.net/qq_35608277/article/details/116082119

版权

DARKNET 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

本文详细解析了Yolov3模型的内部工作原理，包括网络结构、特征提取过程、目标检测的实现，以及关键函数如get_network_boxes、make_network_boxes、num_detections和get_yolo_boxes等。通过对模型配置文件和权重的加载，展示了如何进行图像检测，并分析了检测结果。最后，介绍了非极大值抑制（NMS）在筛选重叠边界框中的作用。

摘要由CSDN通过智能技术生成

./darknet detector test cfg/coco.data cfg/yolov3.cfg model/yolov3.weights data/dog.jpg # 加载yolov3配置文件和模型参数进行检测

# yolov3 log 从36层截取：0-74层为darknet-53主干，一共52个conv layer，其余23层都是res layer(即shortcut操作)；75-106层为yolov3的特征交互层，分为三种尺度(第82、94、106层为yolo层)
layer     filters    size              input                output
   36 res   33                  52 x  52 x 256   ->    52 x  52 x 256
   37 conv    512  3 x 3 / 2    52 x  52 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   38 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   39 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   40 res   37                  26 x  26 x 512   ->    26 x  26 x 512
   41 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   42 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   43 res   40                  26 x  26 x 512   ->    26 x  26 x 512
   44 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   45 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   46 res   43                  26 x  26 x 512   ->    26 x  26 x 512
   47 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   48 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   49 res   46                  26 x  26 x 512   ->    26 x  26 x 512
   50 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   51 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   52 res   49                  26 x  26 x 512   ->    26 x  26 x 512
   53 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   54 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   55 res   52                  26 x  26 x 512   ->    26 x  26 x 512
   56 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   57 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   58 res   55                  26 x  26 x 512   ->    26 x  26 x 512
   59 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   60 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   61 res   58                  26 x  26 x 512   ->    26 x  26 x 512
   62 conv   1024  3 x 3 / 2    26 x  26 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   63 conv    512  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 512  0.177 BFLOPs
   64 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   65 res   62                  13 x  13 x1024   ->    13 x  13 x1024
   66 conv    512  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 512  0.177 BFLOPs
   67 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   68 res   65                  13 x  13 x1024   ->    13 x  13 x1024
   69 conv    512  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 512  0.177 BFLOPs
   70 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   71 res   68                  13 x  13 x1024   ->    13 x  13 x1024
   72 conv    512  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 512  0.177 BFLOPs
   73 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   74 res   71                  13 x  13 x1024   ->    13 x  13 x1024
   75 conv    512  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 512  0.177 BFLOPs
   76 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   77 conv    512  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 512  0.177 BFLOPs
   78 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   79 conv    512  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 512  0.177 BFLOPs
   80 conv   1024  3 x 3 / 1    13 x  13 x 512   ->    13 x  13 x1024  1.595 BFLOPs
   81 conv    255  1 x 1 / 1    13 x  13 x1024   ->    13 x  13 x 255  0.088 BFLOPs
   82 yolo # small尺寸的特征图 13*13*(3*(5+80))
   83 route  79
   84 conv    256  1 x 1 / 1    13 x  13 x 512   ->    13 x  13 x 256  0.044 BFLOPs
   85 upsample            2x    13 x  13 x 256   ->    26 x  26 x 256 # 对当前特征层进行上采样
   86 route  85 61 # concat 85和61层 起到特征合并的作用 类似FPN的思想
   87 conv    256  1 x 1 / 1    26 x  26 x 768   ->    26 x  26 x 256  0.266 BFLOPs
   88 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   89 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   90 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   91 conv    256  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 256  0.177 BFLOPs
   92 conv    512  3 x 3 / 1    26 x  26 x 256   ->    26 x  26 x 512  1.595 BFLOPs
   93 conv    255  1 x 1 / 1    26 x  26 x 512   ->    26 x  26 x 255  0.177 BFLOPs
   94 yolo # middle尺寸的特征图 26*26*(3*(5+80))
   95 route  91
   96 conv    128  1 x 1 / 1    26 x  26 x 256   ->    26 x  26 x 128  0.044 BFLOPs
   97 upsample            2x    26 x  26 x 128   ->    52 x  52 x 128 # 上采样
   98 route  97 36 # concat 97和36层
   99 conv    128  1 x 1 / 1    52 x  52 x 384   ->    52 x  52 x 128  0.266 BFLOPs
  100 conv    256  3 x 3 / 1    52 x  52 x 128   ->    52 x  52 x 256  1.595 BFLOPs
  101 conv    128  1 x 1 / 1    52 x  52 x 256   ->    52 x  52 x 128  0.177 BFLOPs
  102 conv    256  3 x 3 / 1    52 x  52 x 128   ->    52 x  52 x 256  1.595 BFLOPs
  103 conv    128  1 x 1 / 1    52 x  52 x 256   ->    52 x  52 x 128  0.177 BFLOPs
  104 conv    256  3 x 3 / 1    52 x  52 x 128   ->    52 x  52 x 256  1.595 BFLOPs
  105 conv    255  1 x 1 / 1    52 x  52 x 256   ->    52 x  52 x 255  0.353 BFLOPs
  106 yolo # large尺寸的特征图 52*52*(3*(5+80))
Loading weights from model/yolov3.weights...Done!
data/dog.jpg: Predicted in 0.024054 seconds. # 1080T inference time

# 图像中类别和置信度
dog: 99%
truck: 92%
bicycle: 99%

yolov3的核心部分:主要是yolo_layer的实现，对输出的三个tensor进行解析

获取候选框

//src/network.c get_network_boxes函数
/*
在解析yolov3数据这个部分涉及的数据结构：
typedef struct{
    float x, y, w, h; // 存放bbox的中心点坐标和w、h
} box;

typedef struct detection{
    box bbox; // bbox的坐标信息
    int classes; // 类别个数
    float *prob; // 类别置信度数组
    float *mask; 
    float objectness; // 目标置信度(单个float标量，不是数组)
    int sort_class; // NMS排序时使用的类别下标：do_nms_sort逐类设置，nms_comparator据此比较prob[sort_class]
} detection;
*/
/*
 * Build the detection array for one image: allocate it, then fill it.
 *
 * net      - network whose layers are scanned for predictions
 * w, h     - width and height of the original image (passed through to
 *            fill_network_boxes)
 * thresh   - threshold handed to both make_network_boxes and
 *            fill_network_boxes
 * hier     - passed through to fill_network_boxes
 * map      - passed through to fill_network_boxes
 * relative - passed through to fill_network_boxes
 * num      - passed to make_network_boxes, which stores the number of
 *            allocated detections through it when it is non-NULL
 *
 * Returns the array produced by make_network_boxes after
 * fill_network_boxes has written into it.
 */
detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num)
{
    detection *boxes = make_network_boxes(net, thresh, num);

    fill_network_boxes(net, w, h, thresh, hier, map, relative, boxes);
    return boxes;
}

//src/network.c make_network_boxes函数
/*
 * Allocate storage for every detection the network will report.
 *
 * The number of detections comes from num_detections(net, thresh). Each
 * detection gets a zero-initialised prob array with one float per class,
 * and, when the last layer has more than 4 coords, a zero-initialised mask
 * array of (coords - 4) floats. Class and coord counts are read from the
 * LAST layer of the network.
 *
 * net    - network to size the allocation for
 * thresh - threshold forwarded to num_detections
 * num    - if non-NULL, receives the number of detections allocated
 *
 * Returns the calloc'ed detection array. Allocation results are not
 * checked here.
 */
detection *make_network_boxes(network *net, float thresh, int *num)
{
    layer last = net->layers[net->n - 1];
    int total = num_detections(net, thresh);
    detection *boxes;
    int idx;

    if (num) {
        *num = total;
    }

    boxes = calloc(total, sizeof(detection));
    for (idx = 0; idx < total; ++idx) {
        boxes[idx].prob = calloc(last.classes, sizeof(float));
        /* mask stays NULL (from calloc) unless there are extra coords */
        if (last.coords > 4) {
            boxes[idx].mask = calloc(last.coords - 4, sizeof(float));
        }
    }
    return boxes;
}
->//src/network.c num_detections函数
/*
 * Count how many detections all output layers of the network will produce.
 *
 * YOLO layers contribute only the entries that pass the threshold (see
 * yolo_num_detections); DETECTION and REGION layers contribute a fixed
 * w*h*n entries each. Every layer of the network is visited, so a network
 * with several YOLO layers gets the sum over all of them.
 *
 * net    - network to scan
 * thresh - threshold forwarded to yolo_num_detections
 *
 * Returns the total number of detections.
 */
int num_detections(network *net, float thresh)
{
    int total = 0;
    int idx = 0;

    while (idx < net->n) {
        layer cur = net->layers[idx];

        if (cur.type == YOLO) {
            total += yolo_num_detections(cur, thresh);
        }
        if (cur.type == REGION || cur.type == DETECTION) {
            total += cur.w*cur.h*cur.n;
        }
        ++idx;
    }
    return total;
}
-->//src/yolo_layer.c yolo_num_detections函数
/*
 * Count the predictions of one YOLO layer whose objectness exceeds thresh.
 *
 * For every feature-map cell (l.w*l.h of them) and every anchor (l.n of
 * them) the value at entry 4 is read from l.output and compared with
 * thresh. Per the original author's notes the per-anchor layout is
 * 4 box coordinates, 1 objectness score, then the class scores, e.g.
 * 255*13*13 = 3*(4 + 1 + 80)*13*13 for the smallest feature map;
 * entry_index itself is defined elsewhere.
 *
 * l      - YOLO layer to inspect
 * thresh - objectness threshold; only values strictly greater are counted
 *
 * Returns the number of (cell, anchor) pairs above the threshold.
 */
int yolo_num_detections(layer l, float thresh)
{
    const int cells = l.w*l.h;
    int hits = 0;
    int cell, anchor;

    for (cell = 0; cell < cells; ++cell) {
        for (anchor = 0; anchor < l.n; ++anchor) {
            int obj_index = entry_index(l, 0, anchor*cells + cell, 4);

            hits += (l.output[obj_index] > thresh);
        }
    }
    return hits;
}

//src/network.c fill_network_boxes函数
/*
 * Write the decoded detections of every output layer into dets.
 *
 * Layers are visited in order and each output layer writes at the current
 * position in dets; the position then advances by the number of entries
 * that layer accounts for:
 *   - YOLO:      the count returned by get_yolo_detections
 *   - REGION:    l.w*l.h*l.n (this is the only branch that uses hier)
 *   - DETECTION: l.w*l.h*l.n
 *
 * net      - network to read from; net->w / net->h are the network input size
 * w, h     - original image width and height
 * thresh   - threshold forwarded to the per-layer routines
 * hier     - forwarded to get_region_detections only
 * map      - forwarded to the YOLO and REGION routines
 * relative - forwarded to the YOLO and REGION routines
 * dets     - destination array (sized by make_network_boxes)
 */
void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets)
{
    detection *cursor = dets;
    int idx;

    for (idx = 0; idx < net->n; ++idx) {
        layer cur = net->layers[idx];

        if (cur.type == YOLO) {
            /* the return value tells us how far this layer advanced */
            const int written = get_yolo_detections(cur, w, h, net->w, net->h, thresh, map, relative, cursor);
            cursor += written;
        }
        if (cur.type == REGION) {
            get_region_detections(cur, w, h, net->w, net->h, thresh, map, hier, relative, cursor);
            cursor += cur.w*cur.h*cur.n;
        }
        if (cur.type == DETECTION) {
            get_detection_detections(cur, w, h, thresh, cursor);
            cursor += cur.w*cur.h*cur.n;
        }
    }
}
->//src/yolo_layer.c 函数get_yolo_detections
/*
 * Decode the predictions of one YOLO layer into dets.
 *
 * For every feature-map cell and every anchor, the objectness value
 * (entry 4) is read from l.output. Entries that are not strictly greater
 * than thresh are skipped; the rest are decoded with get_yolo_box and
 * stored together with their objectness, the class count, and one
 * probability per class. The stored class probability is
 * objectness * class score, and is zeroed when it does not exceed thresh.
 * Finally correct_yolo_boxes maps the boxes from network-input space back
 * to the original image.
 *
 * The values in l.output have already been passed through the logistic
 * function by the layer's forward pass, as quoted by the original author:
 *
 *   void forward_yolo_layer(const layer l, network net)
 *   {
 *       int i,j,b,t,n;
 *       memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
 *       #ifndef GPU
 *       for (b = 0; b < l.batch; ++b){
 *           for(n = 0; n < l.n; ++n){
 *               int index = entry_index(l, b, n*l.w*l.h, 0);
 *               activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); // sigmoid(tx), sigmoid(ty)
 *               index = entry_index(l, b, n*l.w*l.h, 4);
 *               activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC); // sigmoid(to), sigmoid(class scores)
 *           }
 *       }
 *   }
 *
 * l          - YOLO layer to decode
 * w, h       - original image width and height
 * netw, neth - network input width and height
 * thresh     - threshold for objectness and for the per-class probability
 * map        - accepted for interface compatibility; not used here
 * relative   - forwarded to correct_yolo_boxes
 * dets       - destination; must have room for every entry that
 *              yolo_num_detections counts for this layer
 *
 * Returns the number of detections written.
 */
int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets)
{
    float *out = l.output;
    const int cells = l.w*l.h;
    int found = 0;
    int cell, anchor, cls;

    /*
     * Per the original author's note, with batch == 2 the second batch item
     * is flipped horizontally and averaged into the first.
     * NOTE(review): avg_flipped_yolo is not visible here. If it rewrites
     * l.output, the number of entries above thresh may differ from what
     * yolo_num_detections counted before this call - confirm.
     */
    if (l.batch == 2) avg_flipped_yolo(l);

    for (cell = 0; cell < cells; ++cell) {
        const int row = cell / l.w;
        const int col = cell % l.w;

        for (anchor = 0; anchor < l.n; ++anchor) {
            const int slot = anchor*cells + cell;
            const int obj_index = entry_index(l, 0, slot, 4);
            const float objectness = out[obj_index];
            detection *det;
            int box_index;

            /*
             * Keep exactly the entries the counting pass keeps
             * (output > thresh). Written as a negated '>' rather than '<='
             * so that a NaN objectness is skipped here as well; with '<='
             * a NaN would be written although it was never counted, and
             * dets could be overrun.
             */
            if (!(objectness > thresh)) continue;

            box_index = entry_index(l, 0, slot, 0);
            det = &dets[found];
            det->bbox = get_yolo_box(out, l.biases, l.mask[anchor], box_index, col, row, l.w, l.h, netw, neth, cells);
            det->objectness = objectness;
            det->classes = l.classes;
            for (cls = 0; cls < l.classes; ++cls) {
                int class_index = entry_index(l, 0, slot, 4 + 1 + cls);
                /* stored score is objectness times the class score */
                float prob = objectness*out[class_index];

                det->prob[cls] = (prob > thresh) ? prob : 0;
            }
            ++found;
        }
    }

    /* map boxes from network-input space to the original image */
    correct_yolo_boxes(dets, found, w, h, netw, neth, relative);
    return found;
}
-->//src/yolo_layer.c 函数get_yolo_box
/*
 * Decode one predicted box from the raw layer output.
 *
 * The network emits four values per box: tx, ty, tw, th, stored `stride`
 * floats apart starting at x[index].
 *
 * x       - prediction buffer
 * biases  - anchor sizes, stored as (width, height) pairs
 * n       - anchor index into biases; callers pass the layer's mask entry
 *           so that each YOLO layer picks its own anchors
 * index   - offset of this box's first value (tx) in x
 * i, j    - column and row of the cell on the feature map
 * lw, lh  - feature-map width and height
 * w, h    - network input width and height
 * stride  - distance between consecutive values of the same box
 *           (callers pass lw*lh)
 *
 * Returns a box whose centre is (cell position + offset) divided by the
 * feature-map size, and whose size is exp(t) times the anchor size divided
 * by the network input size. All four fields are therefore fractions of
 * the network input, not pixels.
 */
box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
{
    const float tx = x[index + 0*stride];
    const float ty = x[index + 1*stride];
    const float tw = x[index + 2*stride];
    const float th = x[index + 3*stride];
    box out;

    /* centre: offset inside the cell plus the cell's own coordinate */
    out.x = (i + tx) / lw;
    out.y = (j + ty) / lh;

    /* size: anchor (prior) dimensions scaled by the exponentiated prediction */
    out.w = exp(tw) * biases[2*n]   / w;
    out.h = exp(th) * biases[2*n+1] / h;
    return out;
}
--->//src/yolo_layer.c correct_yolo_boxes函数
/*
 * Map boxes from network-input space back to the original image.
 *
 * The image is scaled with its aspect ratio preserved so that it fits
 * inside the netw x neth input and is centred there; new_w x new_h is the
 * size of that scaled image inside the input. For each box:
 *
 *   - (netw - new_w)/2 is the padding on the left of the scaled image, and
 *     dividing it by netw expresses it as a fraction of the input width.
 *     Subtracting it from x moves the origin to the image's left edge;
 *     dividing by new_w/netw then stretches the result so the image itself
 *     spans 0..1. y is handled the same way with the heights.
 *   - w and h only need the stretch, so they are multiplied by
 *     netw/new_w and neth/new_h.
 *
 * dets       - detections to adjust in place
 * n          - number of detections
 * w, h       - original image width and height
 * netw, neth - network input width and height
 * relative   - when zero, the results are additionally multiplied by w / h
 *              to give pixel coordinates; otherwise they stay as fractions
 *              of the original image
 */
void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
{
    int new_w = 0;
    int new_h = 0;
    int k;

    /* pick the dimension that limits the scale factor */
    if (((float)netw/w) < ((float)neth/h)) {
        new_w = netw;
        new_h = (h * netw)/w;
    } else {
        new_h = neth;
        new_w = (w * neth)/h;
    }

    for (k = 0; k < n; ++k) {
        box *b = &dets[k].bbox;

        b->x = (b->x - (netw - new_w)/2./netw) / ((float)new_w/netw);
        b->y = (b->y - (neth - new_h)/2./neth) / ((float)new_h/neth);
        b->w *= (float)netw/new_w;
        b->h *= (float)neth/new_h;

        if (relative) continue;

        /* absolute output requested: convert fractions to pixels */
        b->x *= w;
        b->w *= w;
        b->y *= h;
        b->h *= h;
    }
}

nms

//src/box.c do_nms_sort函数
/*
 * Per-class non-maximum suppression over an array of detections.
 *
 * Step 1: detections whose objectness is exactly 0 are swapped to the tail
 *         of the array and excluded from the rest of the procedure.
 * Step 2: for each class k, every detection's sort_class is set to k and
 *         the array is sorted with nms_comparator, which puts higher
 *         prob[k] first. Walking the sorted array, each detection with a
 *         non-zero prob[k] suppresses every later detection whose box
 *         overlaps it with IoU greater than thresh, by setting that
 *         detection's prob[k] to 0.
 *
 * Each detection carries `classes` probabilities, so suppression is done
 * class by class; a box suppressed for one class can survive for another.
 *
 * The comparator, as quoted by the original author:
 *
 *   int nms_comparator(const void *pa, const void *pb)
 *   {
 *       detection a = *(detection *)pa;
 *       detection b = *(detection *)pb;
 *       float diff = 0;
 *       if(b.sort_class >= 0){
 *           diff = a.prob[b.sort_class] - b.prob[b.sort_class];
 *       } else {
 *           diff = a.objectness - b.objectness;
 *       }
 *       if(diff < 0) return 1;
 *       else if(diff > 0) return -1;
 *       return 0;
 *   }
 *
 * dets    - detections, reordered and modified in place
 * total   - number of detections in dets
 * classes - number of classes per detection
 * thresh  - IoU threshold above which the lower-ranked box is suppressed
 */
void do_nms_sort(detection *dets, int total, int classes, float thresh)
{
    int last = total - 1;
    int pos = 0;
    int cls, keep, other;

    /* move zero-objectness entries behind `last`; re-check the swapped-in one */
    while (pos <= last) {
        if (dets[pos].objectness == 0) {
            detection tmp = dets[pos];
            dets[pos] = dets[last];
            dets[last] = tmp;
            --last;
        } else {
            ++pos;
        }
    }
    total = last + 1;

    for (cls = 0; cls < classes; ++cls) {
        /* tell the comparator which class to rank by */
        for (pos = 0; pos < total; ++pos) {
            dets[pos].sort_class = cls;
        }
        qsort(dets, total, sizeof(detection), nms_comparator);

        for (keep = 0; keep < total; ++keep) {
            if (dets[keep].prob[cls] != 0) {
                box a = dets[keep].bbox;

                for (other = keep + 1; other < total; ++other) {
                    box b = dets[other].bbox;

                    if (box_iou(a, b) > thresh) {
                        dets[other].prob[cls] = 0;
                    }
                }
            }
        }
    }
}

ref
https://blog.csdn.net/wwwhp/article/details/84718089
https://blog.csdn.net/caicaiatnbu/article/details/102962445?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.control&dist_request_id=1332037.188.16190989167598081&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-1.control
https://blog.csdn.net/hfq0219/article/details/90141698?utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-3.control&dist_request_id=&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromMachineLearnPai2%7Edefault-3.control
https://www.cnblogs.com/walktosee/p/10484024.html