YOLO-V3可视化训练过程中的参数,绘制loss、IOU、avg Recall等的曲线图参考这篇文章:
https://blog.csdn.net/qq_34806812/article/details/81459982?utm_source=blogxgwz3
文章很详细,但说实话这个过程感觉有些麻烦,打印信息一大堆,比如我只想看看loss或者agv_loss,这很不简洁,还好darknet是完全开源的,索性看了下训练源码,修改了一下部分,然后再次编译就行了。我想画一下loss或者agv_loss,所以只保存loss和agv_loss的值。darknet训练主函数在当前目录的 examples/detector.c中的void train_detector()函数中,因此只需在这个函数中修改一下就可以了,由于函数不长我就把所有代码及修改贴在这里:
/*
 * Train a YOLO detector, additionally logging per-batch loss and running
 * average loss to "<backup>/TrainLog_Y-M-D-H-M.txt" so the training curve
 * can be plotted afterwards.
 *
 * datacfg    - .data config file (train list, backup dir, ...)
 * cfgfile    - network .cfg file
 * weightfile - optional pretrained weights (may be NULL)
 * gpus/ngpus - GPU ids to train on and their count
 * clear      - reset the seen-images counter when non-zero
 */
void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
{
    list *options = read_data_cfg(datacfg);
    char *train_images = option_find_str(options, "train", "data/train.list");
    char *backup_directory = option_find_str(options, "backup", "/backup/");

    /*************************DQ add code start*****************************/
    /* Open a timestamped loss-log file in the backup directory.
     * BUGFIX: the original format string ended in '\n', which put a literal
     * newline INSIDE the file name; snprintf also guards against overflowing
     * the 256-byte buffer if backup_directory is long. */
    time_t t;
    struct tm *lt;
    time(&t);
    lt = localtime(&t);
    char buffStr[256];
    snprintf(buffStr, sizeof(buffStr), "%s/TrainLog_%d-%d-%d-%d-%d.txt",
             backup_directory, lt->tm_year + 1900, lt->tm_mon + 1,
             lt->tm_mday, lt->tm_hour, lt->tm_min);
    FILE* LogFId = fopen(buffStr, "w+");
    /*************************DQ add code end*****************************/

    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    float avg_loss = -1;
    network **nets = calloc(ngpus, sizeof(network));

    srand(time(0));
    int seed = rand();
    int i;
    for(i = 0; i < ngpus; ++i){
        srand(seed);
#ifdef GPU
        cuda_set_device(gpus[i]);
#endif
        nets[i] = load_network(cfgfile, weightfile, clear);
        nets[i]->learning_rate *= ngpus;  /* scale LR with the number of GPUs */
    }
    srand(time(0));
    network *net = nets[0];

    int imgs = net->batch * net->subdivisions * ngpus;  /* images per iteration */
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    data train, buffer;

    layer l = net->layers[net->n - 1];  /* detection (yolo) output layer */

    int classes = l.classes;
    float jitter = l.jitter;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = get_base_args(net);
    args.coords = l.coords;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.classes = classes;
    args.jitter = jitter;
    args.num_boxes = l.max_boxes;
    args.d = &buffer;
    args.type = DETECTION_DATA;
    //args.type = INSTANCE_DATA;
    args.threads = 64;

    pthread_t load_thread = load_data(args);
    double time;
    int count = 0;
    //while(i*imgs < N*120){
    while(get_current_batch(net) < net->max_batches){
        /* Multi-scale training: every 10 iterations pick a new input size. */
        if(l.random && count++%10 == 0){
            printf("Resizing\n");
            int dim = (rand() % 10 + 10) * 32;
            if (get_current_batch(net)+200 > net->max_batches) dim = 608;
            //int dim = (rand() % 4 + 16) * 32;
            printf("%d\n", dim);
            args.w = dim;
            args.h = dim;

            /* Drop the batch that was loaded at the old size and reload. */
            pthread_join(load_thread, 0);
            train = buffer;
            free_data(train);
            load_thread = load_data(args);

            #pragma omp parallel for
            for(i = 0; i < ngpus; ++i){
                resize_network(nets[i], dim, dim);
            }
            net = nets[0];
        }
        time=what_time_is_it_now();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data(args);  /* start prefetching the next batch */

        /*
        int k;
        for(k = 0; k < l.max_boxes; ++k){
            box b = float_to_box(train.y.vals[10] + 1 + k*5);
            if(!b.x) break;
            printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h);
        }
        */
        /*
        int zz;
        for(zz = 0; zz < train.X.cols; ++zz){
            image im = float_to_image(net->w, net->h, 3, train.X.vals[zz]);
            int k;
            for(k = 0; k < l.max_boxes; ++k){
                box b = float_to_box(train.y.vals[zz] + k*5, 1);
                printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
                draw_bbox(im, b, 1, 1,0,0);
            }
            show_image(im, "truth11");
            cvWaitKey(0);
            save_image(im, "truth11");
        }
        */

        printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);

        time=what_time_is_it_now();
        float loss = 0;
#ifdef GPU
        if(ngpus == 1){
            loss = train_network(net, train);
        } else {
            loss = train_networks(nets, ngpus, train, 4);
        }
#else
        loss = train_network(net, train);
#endif
        if (avg_loss < 0) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;  /* exponential moving average */

        i = get_current_batch(net);
        printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, i*imgs);
        /*************************DQ add code start*****************************/
        /* Append "loss,avg_loss" per iteration; flush so the curve is not
         * lost if training crashes or is killed. Extend here if you want
         * to log other quantities as well. */
        if(LogFId){
            fprintf(LogFId, "%f,%f\n", loss, avg_loss);
            fflush(LogFId);
        }
        /*************************DQ add code end*****************************/
        if(i%100==0){
#ifdef GPU
            if(ngpus != 1) sync_nets(nets, ngpus, 0);
#endif
            char buff[256];
            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
        }
        /* Snapshot condition: past 40% of max_batches, save every 2000 iters. */
        if(i > net->max_batches*0.4 && i%2000 == 0){
#ifdef GPU
            if(ngpus != 1) sync_nets(nets, ngpus, 0);
#endif
            char buff[256];
            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        free_data(train);
    }
#ifdef GPU
    if(ngpus != 1) sync_nets(nets, ngpus, 0);
#endif
    char buff[256];
    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
    if(LogFId) fclose(LogFId);  /* close the loss log */
}
修改好了以后,记得重新编译一下。切换到 darknet 主目录,执行:
make clean
make -j12 # -j 后面的并行数视情况修改
好了,至此训练过程中的loss和avg_loss就保存到了backup目录下了。
下面写了个显示loss曲线的程序一并贴在此处:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 2019/09/16 by DQ
"""Plot the avg_loss curve from the TrainLog file written by train_detector().

Each line of the log file has the form "loss,avg_loss"; this script reads
all lines and plots a sub-sampled slice of the average loss against the
iteration number.
"""
import os

import matplotlib.pyplot as plt
import numpy as np

MainDir = '/data/project/darknet'
TrainLogPath = os.path.join(MainDir, 'backup', 'TrainLog_2019-9-19-17-7.txt')

Loss, AvgLoss = [], []
with open(TrainLogPath, 'r') as FId:
    for TxtLine in FId:                 # stream the file, no readlines()
        SplitStr = TxtLine.strip().split(',')
        if len(SplitStr) < 2:           # skip blank / truncated lines
            continue
        Loss.append(float(SplitStr[0]))
        AvgLoss.append(float(SplitStr[1]))

IterNum = len(AvgLoss)
# Skip the noisy first iterations and sub-sample; adjust to your run length.
StartVal, EndVal, Stride = 1000, IterNum, 50
Xs = np.arange(StartVal, EndVal, Stride)
Ys = np.array(AvgLoss[StartVal:EndVal:Stride])

plt.plot(Xs, Ys, label='avg_loss')
plt.xlabel('Iteration')   # was the placeholder 'x label'
plt.ylabel('Loss')        # was the placeholder 'y label'
plt.title("Loss-Iter curve")
plt.legend()
plt.show()
我的迭代次数很大,我就画了一部分图:
全部画出来是这个鬼样,实在不舒服:
这样训练过程中的loss就保存下来了,方便后续查看