2. 解读training部分
2.1 training.py
首先从main函数开始。
def main():
    """Load the params module named on the command line and start training.

    Expects exactly one command-line argument: the path to a Python params
    file that defines a ``TRAINING_PARAMS`` dict.  The dict is augmented with
    a per-run output directory and a TensorBoard writer, then handed to
    ``train``.

    Exits with status 1 (after logging an error) when the argument count is
    wrong or the params file does not exist.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format="[%(asctime)s %(filename)s] %(message)s")

    if len(sys.argv) != 2:
        logging.error("Usage: python training.py params.py")
        # Non-zero status so shells and schedulers can see the failure.
        sys.exit(1)

    params_path = sys.argv[1]
    if not os.path.isfile(params_path):
        logging.error("no params file found! path: %s", params_path)
        sys.exit(1)

    # Turn the file path into an importable module name.  Normalising first
    # lets "./params.py" or "cfg/params.py" work as well as plain "params.py".
    module_name = os.path.splitext(os.path.normpath(params_path))[0]
    module_name = module_name.replace(os.sep, ".")
    config = importlib.import_module(module_name).TRAINING_PARAMS

    # The configured batch size is multiplied by the number of entries in
    # "parallels" (the GPU device list).
    config["batch_size"] *= len(config["parallels"])

    # Create sub_working_dir.  Layout:
    #   working_dir/backbone_name/size{img_w}x{img_h}_try{try}/{timestamp}
    # The path is stored in config["sub_working_dir"]; the TensorBoard writer
    # below is pointed at the same directory.
    sub_working_dir = '{}/{}/size{}x{}_try{}/{}'.format(
        config['working_dir'], config['model_params']['backbone_name'],
        config['img_w'], config['img_h'], config['try'],
        time.strftime("%Y%m%d%H%M%S", time.localtime()))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(sub_working_dir, exist_ok=True)
    config["sub_working_dir"] = sub_working_dir
    logging.info("sub working dir: %s", sub_working_dir)

    # Create the tf_summary writer on the run directory and log the command
    # that opens TensorBoard on it.
    config["tensorboard_writer"] = SummaryWriter(sub_working_dir)
    logging.info("Please using 'python -m tensorboard.main --logdir={}'".format(sub_working_dir))

    # Start training, exposing only the configured GPU ids to CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, config["parallels"]))
    train(config)


if __name__ == "__main__":
    main()
在此插入params.py,定义网络超参数。
# Hyper-parameters and paths for one training run.  training.py imports this
# module and reads TRAINING_PARAMS from it.
TRAINING_PARAMS = {
    "model_params": {
        "backbone_name": "darknet_53",
        # set empty to disable
        "backbone_pretrained": "../weights/darknet53_weights_pytorch.pth",
    },
    "yolo": {
        "anchors": [
            [[116, 90], [156, 198], [373, 326]],
            [[30, 61], [62, 45], [59, 119]],
            [[10, 13], [16, 30], [33, 23]],
        ],
        # Original note: this network is trained on VOC2012.
        # NOTE(review): "train_path" below points at a COCO list - confirm
        # which dataset is actually intended.
        "classes": 20,
    },
    "lr": {
        "backbone_lr": 0.001,
        "other_lr": 0.01,
        # freeze backbone weights to finetune
        "freeze_backbone": False,
        # decay factor applied to the learning rate
        "decay_gamma": 0.1,
        # Decay interval, i.e. how many training rounds pass between decays.
        # Per the original note: with 20, the learning rate is multiplied by
        # decay_gamma**1 once round 20 is reached and by decay_gamma**2 once
        # round 40 is reached.
        "decay_step": 20,
    },
    "optimizer": {
        "type": "sgd",
        "weight_decay": 4e-05,
    },
    "batch_size": 4,
    "train_path": "../data/coco/trainvalno5k.txt",
    "epochs": 100,
    "img_h": 416,
    "img_w": 416,
    # config GPU device
    "parallels": [0],
    # replace with your working dir
    "working_dir": "YOUR_WORKING_DIR",
    # load checkpoint
    "pretrain_snapshot": "",
    "evaluate_type": "",
    "try": 0,
    "export_onnx": False,
}
2.1.1 train(config)函数
def train(config):
'''
param:config,即函数params.py中的参数。
return:
'''
config["global_step"] = config.get("start_step", 0)
is_training = False if config.get("export_onnx") else True
# Load and initialize network
net = ModelMain(config, is_training=is_training)
net.tr