yolov1-代码解析

最新推荐文章于 2023-05-08 15:31:59 发布

知识在于分享

最新推荐文章于 2023-05-08 15:31:59 发布

阅读量1k

点赞数

分类专栏：深度学习

本文链接：https://blog.csdn.net/baidu_40840693/article/details/90606999

版权

深度学习专栏收录该内容

255 篇文章 18 订阅

订阅专栏

yolov1-

网络输出数据格式和标签的格式如下图所示：

2008_000082-label：

14 0.637 0.650666666667 0.13 0.250666666667
14 0.328 0.612 0.084 0.146666666667
3 0.163 0.576 0.234 0.16
13 0.474 0.8 0.436 0.325333333333
其中
14 0.637 0.650666666667 0.13 0.250666666667
=
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0(20类one-hot，类别id=14) 1(置信度) 
0.637(x) 0.650666666667(y) 0.13(w) 0.250666666667(h)

batch=32
subdivisions=8
height=448
width=448
channels=3
cells=7*7
boxs_num=2

20+1+4=25=class+conf+xywh

https://blog.csdn.net/u011507206/article/details/68946952

y的本源追溯：

/*
 * Run one training pass over the data set `d`, one mini-batch at a time.
 *
 * The data set is split into d.X.rows / net.batch full mini-batches; any
 * leftover rows that do not fill a whole batch are not visited.  Each
 * mini-batch is copied into contiguous scratch buffers and handed to
 * train_network_datum().
 *
 * Returns the sum of the values returned by train_network_datum() divided
 * by the number of samples that were visited (batches * batch size).
 */
float train_network(network net, data d)
{
    int batch_size = net.batch;
    int num_batches = d.X.rows / batch_size;

    /* Scratch buffers holding one mini-batch of inputs and of targets. */
    float *inputs = calloc(batch_size*d.X.cols, sizeof(float));
    float *targets = calloc(batch_size*d.y.cols, sizeof(float));

    float total_err = 0;
    int step = 0;
    while(step < num_batches){
        /* Row offset of this mini-batch inside the data set. */
        int first_row = step*batch_size;
        get_next_batch(d, batch_size, first_row, inputs, targets);
        total_err += train_network_datum(net, inputs, targets);
        ++step;
    }

    free(inputs);
    free(targets);
    return total_err/(num_batches*batch_size);
}

函数中：
d.X.rows=32
d.X.cols=602112=448*448*3
d.Y.rows=32
d.Y.cols=1225=25*49
*Y=4*1225
*X=4*602112
batch=4
n=8


/*
 * Copy `n` consecutive rows of the data set, starting at row `offset`,
 * into the flat buffers X (inputs) and y (targets).
 *
 * Row r of the batch lands at X + r*d.X.cols and y + r*d.y.cols, so the
 * caller must provide at least n*d.X.cols and n*d.y.cols floats.
 */
void get_next_batch(data d, int n, int offset, float *X, float *y)
{
    int row;
    for(row = 0; row < n; ++row){
        int src = offset + row;
        float *x_dst = X + row*d.X.cols;
        float *y_dst = y + row*d.y.cols;

        memcpy(x_dst, d.X.vals[src], d.X.cols*sizeof(float));
        memcpy(y_dst, d.y.vals[src], d.y.cols*sizeof(float));
    }
}


void *memcpy(void *str1, const void *str2, size_t n)
参数
str1 -- 指向用于存储复制内容的目标数组，类型强制转换为 void* 指针。
str2 -- 指向要复制的数据源，类型强制转换为 void* 指针。
n -- 要被复制的字节数

d.X.rows=32
d.X.cols=602112=448*448*3
d.Y.rows=32
d.Y.cols=1225=25*49

data d的来源，是在args.d

buffer又从哪里来呢？最初的buffer是从load_data_in_thread(args);这个函数中获得的，我们来剖析下该函数

//data.c
/*
 * Start a thread that runs load_thread() on a private copy of `args`.
 *
 * `args` is copied to the heap so that the new thread does not read from
 * this function's stack frame after it returns.  The copy is passed to
 * load_thread as its argument and is not freed here on success.
 * NOTE(review): presumably load_thread releases it; confirm in its body.
 *
 * Returns the handle of the created thread.  Failures are reported through
 * error(); this code assumes error() does not return.
 */
pthread_t load_data_in_thread(load_args args)
{
    pthread_t thread;
    struct load_args *ptr = calloc(1, sizeof(*ptr));
    /* Without this check an allocation failure would dereference NULL below. */
    if(!ptr) error("calloc failed");
    *ptr = args;
    if(pthread_create(&thread, 0, load_thread, ptr)){
        /* No thread was started, so nothing else will ever own the copy. */
        free(ptr);
        error("Thread creation failed");
    }
    return thread;
}

//data.c
void *load_thread(void *ptr)
{
    //printf("Loading data: %d\n", rand());
    load_args a = *(struct load_args*)ptr;
    if(a.exposure == 0) a.exposure = 1;
    if(a.saturation == 0) a.saturation = 1;
    if(a.aspect == 0) a.aspect = 1;
 
    if (a.type == OLD_CLASSIFICATION_DATA){
        *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h);
    } else if (a.type == CLASSIFICATION_DATA){
        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
    } else if (a.type == SUPER_DATA){
        *a.d = load_data_super(a.paths, a.n, a.m, a.w, a.h, a.scale);
    } else if (a.type == WRITING_DATA){
        *a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h);
    } else if (a.type == REGION_DATA){
        //因为a.type == REGION_DATA，所以调用这个函数，我们继续追～
        *a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
.
.

//data.c
data load_data_region(int n, char **paths, int m, int w, int h, int size, int classes, float jitter, float hue, float saturation, float exposure)
{
    char **random_paths = get_random_paths(paths, n, m);
    int i;
    data d = {0};
    d.shallow = 0;
    //n就是batch size啦
    d.X.rows = n;
    //给X（也就是图像数据）分配内存
    d.X.vals = calloc(d.X.rows, sizeof(float*));
    d.X.cols = h*w*3;
 
 
    int k = size*size*(5+classes);
    //终于找到你啦~\(≧▽≦)/~。这里先给y分配了内存，注意一共分配了n*k个float类型的内存块，为什么分配这么多呢？慢慢往下看～
    d.y = make_matrix(n, k);
    for(i = 0; i < n; ++i){
        //读取图像
        image orig = load_image_color(random_paths[i], 0, 0);
 
        int oh = orig.h;
        int ow = orig.w;
 
        //这里jitter=0.2(cfg文件中有写)，这就是所谓的抖动了，其实就是crop（数据增广的一种）
        //剪掉的不能太多，这里设置图像的左边和右边最多剪掉dw（整幅图像宽度的1/5），上边和下边最多剪掉dh（整幅图像高度的1/5）
        int dw = (ow*jitter);
        int dh = (oh*jitter);
        //rand_uniform生成(-dw, dw)的一个随机数
        int pleft  = rand_uniform(-dw, dw);
        int pright = rand_uniform(-dw, dw);
        int ptop   = rand_uniform(-dh, dh);
        int pbot   = rand_uniform(-dh, dh);
 
        //swidth是图像剪完后的宽度，sheight是图像剪完后的高度
        int swidth =  ow - pleft - pright;
        int sheight = oh - ptop - pbot;
 
        //sx是图像剪完后宽度和原始图像的宽度比，同理sy
        float sx = (float)swidth  / ow;
        float sy = (float)sheight / oh;
 
        //设置图像随机翻转
        int flip = rand()%2;
        //开始剪切图像，咔咔咔，具体代码不看了，很简单～
        image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
 
        //dx=pleft/swidth，dy=ptop/sheight
        float dx = ((float)pleft/ow)/sx;
        float dy = ((float)ptop /oh)/sy;
 
        //都剪完了，当然要把图像重新resize到448*448（论文中说了，输入图像是448*448）
        image sized = resize_image(cropped, w, h);
        //翻转图像～
        if(flip) flip_image(sized);
        //对图像做随机色彩扰动（色调hue、饱和度saturation、曝光exposure）
        random_distort_image(sized, hue, saturation, exposure);
        //最终d.X.vals[]存储的就是要输入的数据啦，准备好X了，我们去准备下y
        d.X.vals[i] = sized.data;
 
        //开始追y，追追追～
        fill_truth_region(random_paths[i], d.y.vals[i], classes, size, flip, dx, dy, 1./sx, 1./sy);

/*
 * Build a batch of `n` training samples for region/detection training.
 *
 * For every sample the image is loaded, randomly cropped (jitter), resized
 * to w x h, optionally mirrored, and its label row is filled by
 * fill_truth_region() using the same crop/flip parameters.
 *
 * n       - number of samples to produce (rows of d.X and d.y)
 * paths/m - passed to get_random_paths(); presumably the pool of m image
 *           paths to sample from (verify against get_random_paths)
 * w, h    - size the cropped image is resized to
 * size    - grid side length; each label row has size*size cells
 * classes - number of classes; each cell holds 5+classes floats
 * jitter  - fraction of the image width/height used as the maximum crop
 *           offset on each side
 *
 * Returns the filled `data`.  The pixel buffers of the resized images are
 * stored in d.X.vals, so they are intentionally not freed here.
 */
data load_data_region(int n, char **paths, int m, int w, int h, int size, int classes, float jitter)
{
    char **random_paths = get_random_paths(paths, n, m);
    int i;
    data d = {0};
    d.shallow = 0;

    // One row per sample; each row will point at a w*h*3 float image buffer.
    d.X.rows = n;
    d.X.vals = calloc(d.X.rows, sizeof(float*));
    d.X.cols = h*w*3;


    // Label row length: size*size grid cells, 5+classes floats per cell.
    int k = size*size*(5+classes);
    d.y = make_matrix(n, k);
    for(i = 0; i < n; ++i){
        image orig = load_image_color(random_paths[i], 0, 0);

        int oh = orig.h;
        int ow = orig.w;

        // Maximum crop offset per side, truncated to whole pixels.
        int dw = (ow*jitter);
        int dh = (oh*jitter);

        // Random offsets for each side (result truncated to int). Negative
        // values make the crop window extend past the original image.
        // Presumably rand_uniform draws uniformly between its two arguments.
        int pleft  = rand_uniform(-dw, dw);
        int pright = rand_uniform(-dw, dw);
        int ptop   = rand_uniform(-dh, dh);
        int pbot   = rand_uniform(-dh, dh);

        // Size of the crop window in pixels.
        int swidth =  ow - pleft - pright;
        int sheight = oh - ptop - pbot;

        // Crop size relative to the original size.
        float sx = (float)swidth  / ow;
        float sy = (float)sheight / oh;

        // Mirror decision drawn from the shared seed data_seed.
        int flip = rand_r(&data_seed)%2;
        image cropped = crop_image(orig, pleft, ptop, swidth, sheight);

        // Crop origin expressed as a fraction of the cropped size
        // (equals pleft/swidth and ptop/sheight).
        float dx = ((float)pleft/ow)/sx;
        float dy = ((float)ptop /oh)/sy;

        image sized = resize_image(cropped, w, h);
        if(flip) flip_image(sized);
        // Ownership of sized.data moves into the batch; do not free `sized`.
        d.X.vals[i] = sized.data;

        // Fill the label row using the same shift, scale (1/sx, 1/sy) and flip.
        fill_truth_region(random_paths[i], d.y.vals[i], classes, size, flip, dx, dy, 1./sx, 1./sy);

        free_image(orig);
        free_image(cropped);
    }
    // Only the pointer array is released here, not the strings it points to.
    free(random_paths);
    return d;
}

/*
 * Return a new w x h image (same channel count as `im`) whose pixel (i, j)
 * is taken from `im` at column i+dx, row j+dy.
 *
 * Source coordinates are first passed through constrain_int() with the
 * bounds [0, im.w-1] / [0, im.h-1]; assuming constrain_int clamps to that
 * range, requests outside `im` read the nearest edge pixel.  If the
 * constrained coordinate is still not inside `im`, the pixel is set to 0.
 */
image crop_image(image im, int dx, int dy, int w, int h)
{
    image cropped = make_image(w, h, im.c);
    int ch, y, x;

    for(ch = 0; ch < im.c; ++ch){
        for(y = 0; y < h; ++y){
            for(x = 0; x < w; ++x){
                int src_row = constrain_int(y + dy, 0, im.h-1);
                int src_col = constrain_int(x + dx, 0, im.w-1);
                int outside = src_row < 0 || src_row >= im.h
                           || src_col < 0 || src_col >= im.w;
                float val = 0;

                if (!outside) {
                    val = get_pixel(im, src_col, src_row, ch);
                }
                set_pixel(cropped, x, y, ch, val);
            }
        }
    }
    return cropped;
}

//data.c
/*
 * Fill one label row (`truth`) for the image at `path`.
 *
 * The label file path is derived from the image path by substituting
 * "images"/"JPEGImages" with "labels" and the image extension with ".txt"
 * (presumably what find_replace does; verify against its definition).
 *
 * path      - image path
 * truth     - output row of num_boxes*num_boxes cells, each laid out as
 *             [objectness, classes one-hot values, x, y, w, h]
 * classes   - number of classes
 * num_boxes - number of grid cells per side (used as the row stride)
 * flip, dx, dy, sx, sy - augmentation parameters forwarded to
 *             correct_boxes() so the boxes match the augmented image
 *
 * At most one box is stored per grid cell: the first box (after shuffling)
 * whose centre falls in the cell wins.
 */
void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int flip, float dx, float dy, float sx, float sy)
{
    char labelpath[4096];
    find_replace(path, "images", "labels", labelpath);
    find_replace(labelpath, "JPEGImages", "labels", labelpath);

    find_replace(labelpath, ".jpg", ".txt", labelpath);
    find_replace(labelpath, ".png", ".txt", labelpath);
    find_replace(labelpath, ".JPG", ".txt", labelpath);
    find_replace(labelpath, ".JPEG", ".txt", labelpath);
    int count = 0;
    // Read the boxes from the label file; count receives how many there are.
    box_label *boxes = read_boxes(labelpath, &count);
    // Shuffle so that no box is systematically preferred when cells collide.
    randomize_boxes(boxes, count);
    // Map box coordinates from the original image to the augmented image.
    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
    float x,y,w,h;
    int id;
    int i;
    for (i = 0; i < count; ++i) {
        x =  boxes[i].x;
        y =  boxes[i].y;
        w =  boxes[i].w;
        h =  boxes[i].h;
        id = boxes[i].id;

        // Boxes that became tiny after cropping are not used as positives.
        if (w < .01 || h < .01) continue;

        // Grid cell that contains the box centre.
        int col = (int)(x*num_boxes);
        int row = (int)(y*num_boxes);

        // Centre position relative to the cell's top-left corner.
        x = x*num_boxes - col;
        y = y*num_boxes - row;

        // Offset of this cell's first float: cell number (row-major) times
        // the per-cell record length.
        int index = (col+row*num_boxes)*(5+classes);
        // The cell already holds a box; keep the earlier one.
        if (truth[index]) continue;
        // Objectness flag.
        truth[index++] = 1;
        // One-hot class entry. The lower bound matters: a negative id from a
        // malformed label file would otherwise write outside the class slots.
        if (id >= 0 && id < classes) truth[index+id] = 1;
        index += classes;
        // Box coordinates follow the class values.
        truth[index++] = x;
        truth[index++] = y;
        truth[index++] = w;
        truth[index++] = h;
    }
    free(boxes);
}

/*
 * Transform `n` boxes in place so they match an augmented image.
 *
 * Each edge is scaled by (sx, sy) and shifted by (-dx, -dy); if `flip` is
 * set the box is mirrored horizontally.  Edges are then limited to [0, 1]
 * and the centre/size fields (x, y, w, h) are recomputed from the edges.
 */
void correct_boxes(box_label *boxes, int n, float dx, float dy, float sx, float sy, int flip)
{
    int idx;
    for(idx = 0; idx < n; ++idx){
        box_label *b = &boxes[idx];

        /* Scale and shift the four edges. */
        b->left   = b->left  * sx - dx;
        b->right  = b->right * sx - dx;
        b->top    = b->top   * sy - dy;
        b->bottom = b->bottom* sy - dy;

        /* Horizontal mirror: left and right swap roles around 1. */
        if(flip){
            float old_left = b->left;
            b->left = 1. - b->right;
            b->right = 1. - old_left;
        }

        /* Keep every edge inside the unit square. */
        b->left   = constrain(0, 1, b->left);
        b->right  = constrain(0, 1, b->right);
        b->top    = constrain(0, 1, b->top);
        b->bottom = constrain(0, 1, b->bottom);

        /* Centre and extent derived from the limited edges. */
        b->x = (b->left+b->right)/2;
        b->y = (b->top+b->bottom)/2;
        b->w = (b->right - b->left);
        b->h = (b->bottom - b->top);

        b->w = constrain(0, 1, b->w);
        b->h = constrain(0, 1, b->h);
    }
}

/*
 * Limit `a` to the range [min, max].
 *
 * Returns min when a < min, max when a > max, and a otherwise.  The lower
 * bound is applied last, so it takes precedence if min > max.
 */
float constrain(float min, float max, float a)
{
    float limited = a;
    if (a > max) limited = max;
    if (a < min) limited = min;
    return limited;
}

/*
 * Forward pass of the detection layer.
 *
 * Copies the layer input to l.output, optionally applies a softmax over the
 * class scores of every grid location, and, when state.train is set, fills
 * l.delta with the scaled (target - output) differences, sets *(l.cost) and
 * prints running statistics.
 *
 * Output layout per batch item, as implied by the index arithmetic below:
 *   [locations * l.classes class scores]
 *   [locations * l.n       predictor confidences]
 *   [locations * l.n * l.coords box values]
 * Truth layout per location (1 + l.coords + l.classes floats):
 *   [object flag, l.classes class values, box values]
 *
 * NOTE(review): per-batch offsets use l.inputs while the copy size uses
 * l.outputs; this presumably relies on inputs == outputs for this layer.
 */
void forward_detection_layer(const detection_layer l, network_state state)
{
    int locations = l.side*l.side;
    int i,j;
    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
    int b;
    if (l.softmax){
        // Normalise the class scores of each location in place.
        for(b = 0; b < l.batch; ++b){
            int index = b*l.inputs;
            for (i = 0; i < locations; ++i) {
                int offset = i*l.classes;
                softmax_array(l.output + index + offset, l.classes, 1,
                        l.output + index + offset);
            }
        }
    }
    if(state.train){
        // Accumulators used only for the statistics printed at the end.
        float avg_iou = 0;
        float avg_cat = 0;
        float avg_allcat = 0;
        float avg_obj = 0;
        float avg_anyobj = 0;
        int count = 0;
        *(l.cost) = 0;
        int size = l.inputs * l.batch;
        memset(l.delta, 0, size * sizeof(float));
        for (b = 0; b < l.batch; ++b){
            int index = b*l.inputs;
            for (i = 0; i < locations; ++i) {
                int truth_index = (b*locations + i)*(1+l.coords+l.classes);
                // First truth value of the cell is the object flag (float truncated to int).
                int is_obj = state.truth[truth_index];
                // Start by treating every predictor of this cell as "no object":
                // its confidence is pushed towards 0.
                for (j = 0; j < l.n; ++j) {
                    int p_index = index + locations*l.classes + i*l.n + j;
                    l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
                    *(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
                    avg_anyobj += l.output[p_index];
                }

                int best_index = -1;
                float best_iou = 0;
                float best_rmse = 20;

                // Cells without an object only contribute the no-object term.
                if (!is_obj){
                    continue;
                }

                // Class term: push each class score towards its truth value.
                int class_index = index + i*l.classes;
                for(j = 0; j < l.classes; ++j) {
                    l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
                    *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
                    if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
                    avg_allcat += l.output[class_index+j];
                }

                // Truth box; x and y are divided by the grid side before comparison,
                // and the same is done to each predicted box below.
                box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
                truth.x /= l.side;
                truth.y /= l.side;

                // Pick the predictor responsible for this object: the one with the
                // highest IoU, or, while no predictor has overlapped the truth box
                // at all, the one with the lowest box_rmse.
                for(j = 0; j < l.n; ++j){
                    int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
                    box out = float_to_box(l.output + box_index);
                    out.x /= l.side;
                    out.y /= l.side;

                    // With l.sqrt the stored w/h are squared before comparison
                    // (the matching delta below uses sqrt of the truth w/h).
                    if (l.sqrt){
                        out.w = out.w*out.w;
                        out.h = out.h*out.h;
                    }

                    float iou  = box_iou(out, truth);
                    //iou = 0;
                    float rmse = box_rmse(out, truth);
                    if(best_iou > 0 || iou > 0){
                        if(iou > best_iou){
                            best_iou = iou;
                            best_index = j;
                        }
                    }else{
                        if(rmse < best_rmse){
                            best_rmse = rmse;
                            best_index = j;
                        }
                    }
                }
                // NOTE(review): best_index stays -1 if no predictor has iou > 0 and
                // none has rmse < 20 (e.g. NaN outputs); the indices computed from it
                // below would then point before this cell's data.

                // l.forced overrides the choice by truth box area.
                // NOTE(review): selecting index 1 assumes l.n >= 2.
                if(l.forced){
                    if(truth.w*truth.h < .1){
                        best_index = 1;
                    }else{
                        best_index = 0;
                    }
                }
                // l.random: choose a random predictor while fewer than 64000
                // samples have been seen.
                if(l.random && *(state.net.seen) < 64000){
                    best_index = rand()%l.n;
                }

                int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
                int tbox_index = truth_index + 1 + l.classes;

                // Recompute the IoU of the selected predictor.
                box out = float_to_box(l.output + box_index);
                out.x /= l.side;
                out.y /= l.side;
                if (l.sqrt) {
                    out.w = out.w*out.w;
                    out.h = out.h*out.h;
                }
                float iou  = box_iou(out, truth);

                //printf("%d,", best_index);
                // Replace the no-object term of the selected predictor with the
                // object term: its confidence is pushed towards 1.
                int p_index = index + locations*l.classes + i*l.n + best_index;
                *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
                *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
                avg_obj += l.output[p_index];
                l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);

                // With l.rescore the confidence target is the IoU instead of 1.
                if(l.rescore){
                    l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
                }

                // Box term for the selected predictor.
                l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
                l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
                l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
                l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
                // With l.sqrt the w/h targets are the square roots of the truth values.
                if(l.sqrt){
                    l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
                    l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
                }

                *(l.cost) += pow(1-iou, 2);
                avg_iou += iou;
                ++count;
            }
        }

        // Disabled code path (condition is the constant 0): would zero every
        // confidence delta whose square is below that of the 100th entry
        // selected by top_k.
        if(0){
            float *costs = calloc(l.batch*locations*l.n, sizeof(float));
            for (b = 0; b < l.batch; ++b) {
                int index = b*l.inputs;
                for (i = 0; i < locations; ++i) {
                    for (j = 0; j < l.n; ++j) {
                        int p_index = index + locations*l.classes + i*l.n + j;
                        costs[b*locations*l.n + i*l.n + j] = l.delta[p_index]*l.delta[p_index];
                    }
                }
            }
            int indexes[100];
            top_k(costs, l.batch*locations*l.n, 100, indexes);
            float cutoff = costs[indexes[99]];
            for (b = 0; b < l.batch; ++b) {
                int index = b*l.inputs;
                for (i = 0; i < locations; ++i) {
                    for (j = 0; j < l.n; ++j) {
                        int p_index = index + locations*l.classes + i*l.n + j;
                        if (l.delta[p_index]*l.delta[p_index] < cutoff) l.delta[p_index] = 0;
                    }
                }
            }
            free(costs);
        }


        // The reported cost is recomputed from l.delta (square of mag_array's
        // result); this assignment discards the value accumulated above.
        *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);


        // NOTE(review): when count is 0 the averages divide by zero and print
        // non-finite values.
        printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
    }
}

这里参数意义如下：
locations：7*7
b ：batch size的索引
i ：locations的索引
1 ：置信度
l.coords ：值为４，分别表示x,y,w,h
l.classes : 20

// Excerpt: selects which of the l.n predictors of a grid cell is matched to
// the truth box. Relies on index, locations, i, truth, best_iou, best_rmse
// and best_index from the enclosing function.
for(j = 0; j < l.n; ++j)
{
    int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
    box out = float_to_box(l.output + box_index);
    out.x /= l.side;
    out.y /= l.side;

    if (l.sqrt){
        out.w = out.w*out.w;
        out.h = out.h*out.h;
    }
    // IoU between the predicted box and the truth box
    float iou  = box_iou(out, truth);
    //iou = 0;
    // root-mean-square error between the predicted box and the truth box
    float rmse = box_rmse(out, truth);
    // Keep the predictor with the highest IoU; while no predictor has any
    // overlap, keep the one with the lowest RMSE instead.
    if(best_iou > 0 || iou > 0){
        if(iou > best_iou){
            best_iou = iou;
            best_index = j;
        }
    }else{
        if(rmse < best_rmse){
            best_rmse = rmse;
            best_index = j;
        }
    }
}

out（每个网格一共l.n个out，论文中l.n=2）就是网络回归出来的值，然后把out的值和truth中的值对应比较，计算出iou，然后从l.n个iou中挑出iou最高的一个，作为最后的预测框，说白了就是：只有该框会对loss function产生影响，其他框不产生影响

-------------------------------------------------------

2008_000008.txt

12 0.524 0.573529411765 0.836 0.753393665158
14 0.447 0.238687782805 0.262 0.278280542986

2009_003377.txt

15 0.049 0.202488687783 0.094 0.246606334842
17 0.501 0.641402714932 0.998 0.717194570136

2009_003377.txt

15 0.049 0.202488687783 0.094 0.246606334842
17 0.501 0.641402714932 0.998 0.717194570136

/*
 * Clamp `a` into [min, max]: values below min yield min, values above max
 * yield max, anything else is returned unchanged.  The min test is made
 * first, so it wins when min > max.
 */
float constrain(float min, float max, float a)
{
    return (a < min) ? min
         : (a > max) ? max
         : a;
}

知识在于分享

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
yolov1-代码解析

yolov1-网络输出数据格式和标签的格式如下图所示：2008_000082-label：14 0.637 0.650666666667 0.13 0.25066666666714 0.328 0.612 0.084 0.1466666666673 0.163 0.576 0.234 0.1613 0.474 0.8 0.436 0.325333333333...
复制链接

扫一扫