自定义darknet YOLO-V3训练过程输出信息到log文件和模型保存间隔

6 篇文章 1 订阅

 

YOLO-V3可视化训练过程中的参数,绘制loss、IOU、avg Recall等的曲线图参考这篇文章:

https://blog.csdn.net/qq_34806812/article/details/81459982?utm_source=blogxgwz3

文章很详细,但说实话这个过程感觉有些麻烦,打印信息一大堆,比如我只想看看loss或者avg_loss,这很不简洁,还好darknet是完全开源的,索性看了下训练源码,修改了一下部分,然后再次编译就行了。我想画一下loss或者avg_loss,所以只保存loss和avg_loss的值。darknet训练主函数在当前目录的 examples/detector.c中的void train_detector()函数中,因此只需在这个函数中修改一下就可以了,由于函数不长我就把所有代码及修改贴在这里:

void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
{
    list *options = read_data_cfg(datacfg);
    char *train_images = option_find_str(options, "train", "data/train.list");
    char *backup_directory = option_find_str(options, "backup", "/backup/");

    /*************************DQ add code start*****************************/
    time_t t;
    struct tm * lt;
    time (&t);
    lt = localtime (&t);
    char buffStr[256];
    sprintf(buffStr, "%s/TrainLog_%d-%d-%d-%d-%d.txt\n", backup_directory,lt->tm_year+1900, lt->tm_mon+1, lt->tm_mday, lt->tm_hour, lt->tm_min);
    FILE* LogFId =fopen(buffStr,"w+");
   /*************************DQ add code end*****************************/
    
    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    float avg_loss = -1;
    network **nets = calloc(ngpus, sizeof(network));

    srand(time(0));
    int seed = rand();
    int i;
    for(i = 0; i < ngpus; ++i){
        srand(seed);
#ifdef GPU
        cuda_set_device(gpus[i]);
#endif
        nets[i] = load_network(cfgfile, weightfile, clear);
        nets[i]->learning_rate *= ngpus;
    }
    srand(time(0));
    network *net = nets[0];

    int imgs = net->batch * net->subdivisions * ngpus;
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    data train, buffer;

    layer l = net->layers[net->n - 1];

    int classes = l.classes;
    float jitter = l.jitter;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = get_base_args(net);
    args.coords = l.coords;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.classes = classes;
    args.jitter = jitter;
    args.num_boxes = l.max_boxes;
    args.d = &buffer;
    args.type = DETECTION_DATA;
    //args.type = INSTANCE_DATA;
    args.threads = 64;

    pthread_t load_thread = load_data(args);
    double time;
    int count = 0;

    //while(i*imgs < N*120){
    while(get_current_batch(net) < net->max_batches){
        if(l.random && count++%10 == 0){
            printf("Resizing\n");
            int dim = (rand() % 10 + 10) * 32;
            if (get_current_batch(net)+200 > net->max_batches) dim = 608;
            //int dim = (rand() % 4 + 16) * 32;
            printf("%d\n", dim);
            args.w = dim;
            args.h = dim;

            pthread_join(load_thread, 0);
            train = buffer;
            free_data(train);
            load_thread = load_data(args);

            #pragma omp parallel for
            for(i = 0; i < ngpus; ++i){
                resize_network(nets[i], dim, dim);
            }
            net = nets[0];
        }
        time=what_time_is_it_now();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data(args);

        /*
           int k;
           for(k = 0; k < l.max_boxes; ++k){
           box b = float_to_box(train.y.vals[10] + 1 + k*5);
           if(!b.x) break;
           printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h);
           }
         */
        /*
           int zz;
           for(zz = 0; zz < train.X.cols; ++zz){
           image im = float_to_image(net->w, net->h, 3, train.X.vals[zz]);
           int k;
           for(k = 0; k < l.max_boxes; ++k){
           box b = float_to_box(train.y.vals[zz] + k*5, 1);
           printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
           draw_bbox(im, b, 1, 1,0,0);
           }
           show_image(im, "truth11");
           cvWaitKey(0);
           save_image(im, "truth11");
           }
         */

        printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);

        time=what_time_is_it_now();
        float loss = 0;
#ifdef GPU
        if(ngpus == 1){
            loss = train_network(net, train);
        } else {
            loss = train_networks(nets, ngpus, train, 4);
        }
#else
        loss = train_network(net, train);
#endif
        if (avg_loss < 0) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;

        i = get_current_batch(net);
        printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, i*imgs);
        /*************************DQ add code start*****************************/
        if(LogFId) fprintf (LogFId,"%f,%f\n",loss, avg_loss);//写loss和avg_loss到指定文件,如果需要其他什么添加一下就行了
        /*************************DQ add code end*****************************/

        if(i%100==0){
#ifdef GPU
            if(ngpus != 1) sync_nets(nets, ngpus, 0);
#endif
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
        }
        if(i>net->max_batches*0.4&&i%2000==0){//设定保存模型的条件,当大于0.4倍最大次数并且每隔2000次保存一次
#ifdef GPU
            if(ngpus != 1) sync_nets(nets, ngpus, 0);
#endif
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        free_data(train);
    }
#ifdef GPU
    if(ngpus != 1) sync_nets(nets, ngpus, 0);
#endif
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);

    if(LogFId) fclose(LogFId);//关闭文件
}

修改好了以后,记得重新编译一下。切换到darknet主目录,执行:

make clean

make -j12 #视情况修改

好了至此训练过程中的loss和avg_loss就保存到了backup目录下了。

下面写了个显示loss曲线的程序一并贴在此处:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 2019/09/16 by DQ
# Plot the avg_loss curve from the "loss,avg_loss" log file written by the
# modified train_detector() in darknet.

import os
import matplotlib.pyplot as plt
import numpy as np

MainDir = '/data/project/darknet'
TrainLogPath = os.path.join(MainDir, 'backup', 'TrainLog_2019-9-19-17-7.txt')

Loss, AvgLoss = [], []
with open(TrainLogPath, 'r') as FId:
	for TxtLine in FId:
		SplitStr = TxtLine.strip().split(',')
		# Skip blank or malformed lines instead of crashing on float('').
		if len(SplitStr) < 2:
			continue
		try:
			Loss.append(float(SplitStr[0]))
			AvgLoss.append(float(SplitStr[1]))
		except ValueError:
			continue

IterNum = len(AvgLoss)
StartVal, EndVal, Stride = 1000, IterNum, 50  # adjust to taste
# Clamp the start so a short log still produces a (possibly full) plot
# instead of an empty one.
StartVal = min(StartVal, IterNum)
Xs = np.arange(StartVal, EndVal, Stride)
Ys = np.array(AvgLoss[StartVal:EndVal:Stride])
plt.plot(Xs, Ys, label='avg_loss')
plt.xlabel('iteration')
plt.ylabel('avg loss')
plt.title("Loss-Iter curve")
plt.legend()
plt.show()

我的迭代次数很大,我就画了一部分图:

 

全部画出了是这个鬼样,实在不舒服:

 

这样训练过程中的loss就保存下来了,方便后续查看

  • 2
    点赞
  • 41
    收藏
    觉得还不错? 一键收藏
  • 13
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 13
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值