1.5 train
Learning objectives:
- Understand the network training process
- Be able to write the network training code
Network training is implemented in the tools folder and is split into training the siammask base network and the siammask sharp network. We first train the base network, and then train the refine network that refines the masks. Since the two training flows are similar, we use train_siammask.py as the example. The file is organized into the parts described below.
1. main
main is the entry point of the program. It first handles the config file, paths, and logging (these settings are taken from the command line, or from files passed on the command line). It then builds the data loaders from the configuration via build_data_loader(cfg), constructs and loads the model, and finally calls train to run the training.
The code of the main function is as follows:
def main():
    """
    Train the base network.
    :return:
    """
    global args, best_acc, tb_writer, logger
    args = parser.parse_args()

    # Initialize logging
    init_log('global', logging.INFO)
    if args.log != "":
        add_file_handler('global', args.log, logging.INFO)

    # Get the global logger
    logger = logging.getLogger('global')
    logger.info("\n" + collect_env_info())
    logger.info(args)

    # Load the configuration
    cfg = load_config(args)
    logger.info("config \n{}".format(json.dumps(cfg, indent=4)))

    if args.log_dir:
        tb_writer = SummaryWriter(args.log_dir)
    else:
        tb_writer = Dummy()

    # Build the datasets
    train_loader, val_loader = build_data_loader(cfg)

    # Build the network to train
    if args.arch == 'Custom':
        from custom import Custom
        model = Custom(pretrain=True, anchors=cfg['anchors'])
    else:
        exit()
    logger.info(model)

    # Load pretrained weights
    if args.pretrained:
        model = load_pretrain(model, args.pretrained)

    # GPU version
    # model = model.cuda()
    # dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda()
    # Wrap the network model
    dist_model = torch.nn.DataParallel(model)

    # Fraction of the backbone parameters to unfreeze
    if args.resume and args.start_epoch != 0:
        model.features.unfix((args.start_epoch - 1) / args.epochs)

    # Build the optimizer and the learning-rate schedule
    optimizer, lr_scheduler = build_opt_lr(model, cfg, args, args.start_epoch)

    # Optionally resume from a checkpoint
    if args.resume:
        assert os.path.isfile(args.resume), '{} is not a valid file'.format(args.resume)
        model, optimizer, args.start_epoch, best_acc, arch = restore_from(model, optimizer, args.resume)
        # GPU
        # dist_model = torch.nn.DataParallel(model, list(range(torch.cuda.device_count()))).cuda()
        dist_model = torch.nn.DataParallel(model)

    logger.info(lr_scheduler)
    logger.info('model prepare done')

    # Train the model
    train(train_loader, dist_model, optimizer, lr_scheduler, args.start_epoch, cfg)
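The call model.features.unfix((args.start_epoch - 1) / args.epochs) progressively unfreezes the backbone as training advances: in early epochs only the heads are trained while the pretrained features stay fixed. As a rough illustration of the idea (not the repo's actual features module; ToyBackbone and its stage layout are hypothetical), a minimal sketch of such progressive unfreezing:

import torch.nn as nn

class ToyBackbone(nn.Module):
    """Hypothetical backbone illustrating progressive unfreezing."""
    def __init__(self, n_stages=4):
        super(ToyBackbone, self).__init__()
        self.stages = nn.ModuleList(
            [nn.Conv2d(3 if i == 0 else 16, 16, 3, padding=1) for i in range(n_stages)])

    def unfix(self, ratio):
        """Make the last `ratio` fraction of stages trainable; return True if anything changed."""
        n_trainable = int(len(self.stages) * ratio)
        changed = False
        for i, stage in enumerate(self.stages):
            trainable = i >= len(self.stages) - n_trainable
            for p in stage.parameters():
                if p.requires_grad != trainable:
                    p.requires_grad = trainable
                    changed = True
        return changed

backbone = ToyBackbone()
backbone.unfix(0.25)  # a quarter of the way through training, only the last stage is trainable

When unfix reports a change, the training loop rebuilds the optimizer so the newly unfrozen parameters get their own parameter groups, which is exactly what build_opt_lr below is called for.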
2. Building the datasets
The datasets are built by the build_data_loader function. The implementation is as follows:
def build_data_loader(cfg):
    """
    Build the training and validation data loaders.
    :param cfg: configuration dict
    :return: train_loader, val_loader
    """
    logger = logging.getLogger('global')

    logger.info("build train dataset")  # train_dataset
    # Build the training set, including data augmentation
    train_set = DataSets(cfg['train_datasets'], cfg['anchors'], args.epochs)
    # Shuffle the training data
    train_set.shuffle()

    # Build the validation set; if none is configured, reuse the training set
    logger.info("build val dataset")  # val_dataset
    if not 'val_datasets' in cfg.keys():
        cfg['val_datasets'] = cfg['train_datasets']
    val_set = DataSets(cfg['val_datasets'], cfg['anchors'])
    val_set.shuffle()

    # DataLoader is a built-in PyTorch class that reads data with multiple worker processes
    train_loader = DataLoader(train_set, batch_size=args.batch, num_workers=args.workers,
                              pin_memory=True, sampler=None)
    val_loader = DataLoader(val_set, batch_size=args.batch, num_workers=args.workers,
                            pin_memory=True, sampler=None)

    logger.info('build dataset done')
    return train_loader, val_loader
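build_data_loader returns standard PyTorch DataLoader objects, so each iteration of the training loop yields one batch of tensors. The following self-contained toy example shows the mechanics; ToyPairDataset is made up for illustration (127 and 255 are the usual exemplar and search crop sizes in SiamMask, but the random tensors stand in for real crops):

import torch
from torch.utils.data import Dataset, DataLoader

class ToyPairDataset(Dataset):
    """Hypothetical stand-in for DataSets: yields one (template, search) crop pair per item."""
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        template = torch.randn(3, 127, 127)  # exemplar crop
        search = torch.randn(3, 255, 255)    # search-region crop
        return template, search

loader = DataLoader(ToyPairDataset(), batch_size=8, num_workers=0, pin_memory=True)
template, search = next(iter(loader))
print(template.shape, search.shape)  # torch.Size([8, 3, 127, 127]) torch.Size([8, 3, 255, 255])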
3. Optimization method
The optimizer and the learning-rate schedule are built by build_opt_lr:
def build_opt_lr(model, cfg, args, epoch):
    """
    Build the optimizer and the learning-rate scheduler.
    :param model: network to train
    :param cfg: configuration dict
    :param args: command-line arguments
    :param epoch: current epoch
    :return: optimizer, lr_scheduler
    """
    # Backbone parameters that are currently trainable
    backbone_feature = model.features.param_groups(cfg['lr']['start_lr'], cfg['lr']['feature_lr_mult'])
    if len(backbone_feature) == 0:
        # Backbone still frozen: train only the RPN (mask branch) parameters
        trainable_params = model.rpn_model.param_groups(cfg['lr']['start_lr'], cfg['lr']['rpn_lr_mult'], 'mask')
    else:
        # Train the backbone, RPN, and mask branches together
        trainable_params = backbone_feature + \
                           model.rpn_model.param_groups(cfg['lr']['start_lr'], cfg['lr']['rpn_lr_mult']) + \
                           model.mask_model.param_groups(cfg['lr']['start_lr'], cfg['lr']['mask_lr_mult'])

    # Stochastic gradient descent optimizer
    optimizer = torch.optim.SGD(trainable_params, args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Build the learning-rate scheduler
    lr_scheduler = build_lr_scheduler(optimizer, cfg['lr'], epochs=args.epochs)
    # Advance the scheduler to the current epoch
    lr_scheduler.step(epoch)

    # Return the optimizer and the scheduler
    return optimizer, lr_scheduler
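The param_groups calls above return lists of {'params': ..., 'lr': ...} dicts, so different parts of the network train with different learning rates. A minimal sketch of the same mechanism with a toy two-part model (the backbone/head split and the 0.1 multiplier are illustrative values, not the repo's configuration):

import torch
import torch.nn as nn

backbone = nn.Linear(8, 8)  # stands in for the pretrained features
head = nn.Linear(8, 2)      # stands in for the RPN / mask heads

start_lr = 0.005
# Each dict becomes one param_group with its own learning rate
optimizer = torch.optim.SGD([
    {'params': backbone.parameters(), 'lr': start_lr * 0.1},  # lower LR for pretrained layers
    {'params': head.parameters(), 'lr': start_lr * 1.0},      # full LR for new layers
], lr=start_lr, momentum=0.9, weight_decay=1e-4)

for pg in optimizer.param_groups:
    print(pg['lr'])  # 0.0005, then 0.005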
4. Network training
The training itself is implemented by the train function shown below. It first obtains the learning rate, the data, and the model, then runs the training loop, computing the losses and accuracies and reporting them.
def train(train_loader, model, optimizer, lr_scheduler, epoch, cfg):
    """
    Train the model.
    :param train_loader: training data loader
    :return:
    """
    global tb_index, best_acc, cur_lr, logger
    # Current learning rate
    cur_lr = lr_scheduler.get_cur_lr()
    logger = logging.getLogger('global')

    # Running averages of timings, losses, and accuracies
    avg = AverageMeter()
    model.train()
    # GPU
    # model = model.cuda()
    end = time.time()

    def is_valid_number(x):
        return not(math.isnan(x) or math.isinf(x) or x > 1e4)

    # Number of iterations per epoch
    num_per_epoch = len(train_loader.dataset) // args.epochs // args.batch
    print("num_per_epoch", num_per_epoch)
    start_epoch = epoch
    epoch = epoch

    # Iterate over the batches
    for iter, input in enumerate(train_loader):
        if epoch != iter // num_per_epoch + start_epoch:  # next epoch
            epoch = iter // num_per_epoch + start_epoch

            # Create the save directory
            if not os.path.exists(args.save_dir):  # makedir/save model
                os.makedirs(args.save_dir)

            # Save a checkpoint
            save_checkpoint({
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.module.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'anchor_cfg': cfg['anchors']
            }, False,
                os.path.join(args.save_dir, 'checkpoint_e%d.pth' % (epoch)),
                os.path.join(args.save_dir, 'best.pth'))

            if epoch == args.epochs:
                return

            # Unfreeze more of the backbone and rebuild the optimizer and scheduler
            if model.module.features.unfix(epoch/args.epochs):
                logger.info('unfix part model.')
                optimizer, lr_scheduler = build_opt_lr(model.module, cfg, args, epoch)

            # Update the current learning rate
            lr_scheduler.step(epoch)
            cur_lr = lr_scheduler.get_cur_lr()
            logger.info('epoch:{}'.format(epoch))

        # Update the log step
        tb_index = iter
        if iter % num_per_epoch == 0 and iter != 0:
            for idx, pg in enumerate(optimizer.param_groups):
                logger.info("epoch {} lr {}".format(epoch, pg['lr']))
                tb_writer.add_scalar('lr/group%d' % (idx+1), pg['lr'], tb_index)

        data_time = time.time() - end
        avg.update(data_time=data_time)

        # Input data
        x = {
            # GPU
            # 'cfg': cfg,
            # 'template': torch.autograd.Variable(input[0]).cuda(),
            # 'search': torch.autograd.Variable(input[1]).cuda(),
            # 'label_cls': torch.autograd.Variable(input[2]).cuda(),
            # 'label_loc': torch.autograd.Variable(input[3]).cuda(),
            # 'label_loc_weight': torch.autograd.Variable(input[4]).cuda(),
            # 'label_mask': torch.autograd.Variable(input[6]).cuda(),
            # 'label_mask_weight': torch.autograd.Variable(input[7]).cuda(),
            'cfg': cfg,
            'template': torch.autograd.Variable(input[0]),
            'search': torch.autograd.Variable(input[1]),
            'label_cls': torch.autograd.Variable(input[2]),
            'label_loc': torch.autograd.Variable(input[3]),
            'label_loc_weight': torch.autograd.Variable(input[4]),
            'label_mask': torch.autograd.Variable(input[6]),
            'label_mask_weight': torch.autograd.Variable(input[7]),
        }

        # Forward pass
        outputs = model(x)

        # Per-branch losses
        rpn_cls_loss, rpn_loc_loss, rpn_mask_loss = torch.mean(outputs['losses'][0]), torch.mean(outputs['losses'][1]), torch.mean(outputs['losses'][2])
        # Mask accuracy metrics
        mask_iou_mean, mask_iou_at_5, mask_iou_at_7 = torch.mean(outputs['accuracy'][0]), torch.mean(outputs['accuracy'][1]), torch.mean(outputs['accuracy'][2])

        # Weights of the classification, regression, and segmentation losses
        cls_weight, reg_weight, mask_weight = cfg['loss']['weight']

        # Total weighted loss
        loss = rpn_cls_loss * cls_weight + rpn_loc_loss * reg_weight + rpn_mask_loss * mask_weight

        # Zero the gradients
        optimizer.zero_grad()
        # Backward pass
        loss.backward()

        # Clip the gradient norm to prevent exploding gradients
        if cfg['clip']['split']:
            torch.nn.utils.clip_grad_norm_(model.module.features.parameters(), cfg['clip']['feature'])
            torch.nn.utils.clip_grad_norm_(model.module.rpn_model.parameters(), cfg['clip']['rpn'])
            torch.nn.utils.clip_grad_norm_(model.module.mask_model.parameters(), cfg['clip']['mask'])
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)  # gradient clip

        if is_valid_number(loss.item()):
            optimizer.step()

        siammask_loss = loss.item()

        batch_time = time.time() - end

        # Update the running averages
        avg.update(batch_time=batch_time, rpn_cls_loss=rpn_cls_loss, rpn_loc_loss=rpn_loc_loss,
                   rpn_mask_loss=rpn_mask_loss, siammask_loss=siammask_loss,
                   mask_iou_mean=mask_iou_mean, mask_iou_at_5=mask_iou_at_5, mask_iou_at_7=mask_iou_at_7)

        # Write the metrics to TensorBoard
        tb_writer.add_scalar('loss/cls', rpn_cls_loss, tb_index)
        tb_writer.add_scalar('loss/loc', rpn_loc_loss, tb_index)
        tb_writer.add_scalar('loss/mask', rpn_mask_loss, tb_index)
        tb_writer.add_scalar('mask/mIoU', mask_iou_mean, tb_index)
        tb_writer.add_scalar('mask/AP@.5', mask_iou_at_5, tb_index)
        tb_writer.add_scalar('mask/AP@.7', mask_iou_at_7, tb_index)

        end = time.time()

        # Periodic log output
        if (iter + 1) % args.print_freq == 0:
            logger.info('Epoch: [{0}][{1}/{2}] lr: {lr:.6f}\t{batch_time:s}\t{data_time:s}'
                        '\t{rpn_cls_loss:s}\t{rpn_loc_loss:s}\t{rpn_mask_loss:s}\t{siammask_loss:s}'
                        '\t{mask_iou_mean:s}\t{mask_iou_at_5:s}\t{mask_iou_at_7:s}'.format(
                            epoch+1, (iter + 1) % num_per_epoch, num_per_epoch, lr=cur_lr, batch_time=avg.batch_time,
                            data_time=avg.data_time, rpn_cls_loss=avg.rpn_cls_loss, rpn_loc_loss=avg.rpn_loc_loss,
                            rpn_mask_loss=avg.rpn_mask_loss, siammask_loss=avg.siammask_loss, mask_iou_mean=avg.mask_iou_mean,
                            mask_iou_at_5=avg.mask_iou_at_5, mask_iou_at_7=avg.mask_iou_at_7))
            print_speed(iter + 1, avg.batch_time.avg, args.epochs * num_per_epoch)
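The AverageMeter used above keeps a running average per metric and prints each one as "value (average)", which is the format seen in the training log in section 6. A minimal sketch of such a meter, assuming the attribute-per-metric interface implied by the usage above (the repo ships its own implementation, which may differ):

class Meter:
    """Running average of a single metric (a sketch, not the repo's implementation)."""
    def __init__(self, name):
        self.name = name
        self.val, self.sum, self.count, self.avg = 0.0, 0.0, 0, 0.0

    def update(self, val):
        self.val = val
        self.sum += val
        self.count += 1
        self.avg = self.sum / self.count

    def __str__(self):
        # "name: last_value (running_average)"
        return '{}: {:.6f} ({:.6f})'.format(self.name, self.val, self.avg)

class AverageMeterSketch:
    """Keeps one Meter per keyword passed to update(), exposed as attributes."""
    def __init__(self):
        self.meters = {}

    def update(self, **kwargs):
        for name, val in kwargs.items():
            self.meters.setdefault(name, Meter(name)).update(float(val))

    def __getattr__(self, name):
        try:
            return self.meters[name]
        except KeyError:
            raise AttributeError(name)

avg = AverageMeterSketch()
avg.update(batch_time=0.8, rpn_cls_loss=0.6)
avg.update(batch_time=1.0, rpn_cls_loss=0.5)
print(avg.batch_time)         # batch_time: 1.000000 (0.900000)
print(avg.rpn_cls_loss.avg)   # 0.55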
5. Saving the model
The trained model parameters are saved to a .pth file.
def save_checkpoint(state, is_best, filename='checkpoint.pth', best_file='model_best.pth'):
    # Serialize the full training state to disk
    torch.save(state, filename)
    # If this is the best model so far, also copy it to best_file
    if is_best:
        shutil.copyfile(filename, best_file)
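To reload one of these checkpoints later (restore_from in the repo wraps this plus some bookkeeping), a minimal sketch; the path is an assumed example following the checkpoint_e%d.pth naming used by the training loop:

import torch

# Assumed path following the naming scheme in the training loop above
checkpoint = torch.load('snapshot/checkpoint_e4.pth', map_location='cpu')

print(checkpoint['epoch'], checkpoint['arch'], checkpoint['best_acc'])
# model.load_state_dict(checkpoint['state_dict'])     # restore the network weights
# optimizer.load_state_dict(checkpoint['optimizer'])  # restore the optimizer state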
6. Training the base network
To train the network, run the following in a terminal:
cd $SiamMask_master/experiments/siammask_base/
bash run.sh
where run.sh contains:
# Set the environment variables
ROOT=/Users/yaoxiaoying/Documents/01-工作/03.计算机视觉/03.智慧交通/04.单目标追踪/SiamMask-master
export PYTHONPATH=$ROOT:$PYTHONPATH
# Create the log directory
mkdir -p logs
# Set the parameters and run the training script
python -u $ROOT/tools/train_siammask.py \
    --config=config.json -b 8 \
    -j 0 \
    --epochs 8 \
    --log logs/log.txt \
    2>&1 | tee logs/train.log
Sample output during training:
INFO:global:Progress: 10 / 600000 [0%], Speed: 15.042 s/iter, ETA 104:10:56 (D:H:M)
[2020-01-16 11:15:42,693-rk0-train_siammask.py#298] Epoch: [1][20/75000] lr: 0.001000 batch_time: 14.996075 (14.985780) data_time: 0.072848 (0.087712)rpn_cls_loss: 0.603426 (0.679796) rpn_loc_loss: 0.680157 (0.781068) rpn_mask_loss: 0.691833 (0.697598) siammask_loss: 26.325602 (26.730618) mask_iou_mean: 0.000000 (0.000000) mask_iou_at_5: 0.000000 (0.000000) mask_iou_at_7: 0.000000 (0.000000)
INFO:global:Epoch: [1][20/75000] lr: 0.001000 batch_time: 14.996075 (14.985780) data_time: 0.072848 (0.087712) rpn_cls_loss: 0.603426 (0.679796) rpn_loc_loss: 0.680157 (0.781068) rpn_mask_loss: 0.691833 (0.697598) siammask_loss: 26.325602 (26.730618) mask_iou_mean: 0.000000 (0.000000) mask_iou_at_5: 0.000000 (0.000000) mask_iou_at_7: 0.000000 (0.000000)
[2020-01-16 11:15:42,694-rk0-log_helper.py#102] Progress: 20 / 600000 [0%], Speed: 14.986 s/iter, ETA 104:01:32 (D:H:M)
INFO:global:Progress: 20 / 600000 [0%], Speed: 14.986 s/iter, ETA 104:01:32 (D:H:M)
[2020-01-16 11:18:13,987-rk0-train_siammask.py#298] Epoch: [1][30/75000] lr: 0.001000 batch_time: 15.297617 (15.033497) data_time: 0.078839 (0.086977)rpn_cls_loss: 0.538483 (0.651094) rpn_loc_loss: 0.670039 (0.715383) rpn_mask_loss: 0.687830 (0.694915) siammask_loss: 26.104422 (26.526494) mask_iou_mean: 0.000000 (0.000000) mask_iou_at_5: 0.000000 (0.000000) mask_iou_at_7: 0.000000 (0.000000)
INFO:global:Epoch: [1][30/75000] lr: 0.001000 batch_time: 15.297617 (15.033497) data_time: 0.078839 (0.086977) rpn_cls_loss: 0.538483 (0.651094) rpn_loc_loss: 0.670039 (0.715383) rpn_mask_loss: 0.687830 (0.694915) siammask_loss: 26.104422 (26.526494) mask_iou_mean: 0.000000 (0.000000) mask_iou_at_5: 0.000000 (0.000000) mask_iou_at_7: 0.000000 (0.000000)
After each epoch a checkpoint is saved in the snapshot directory; below are the results after training for 5 epochs:
After training finishes, TensorBoard shows how the losses evolve:
Classification loss:
Localization loss:
Segmentation loss:
The loss curves show that the classification, localization, and segmentation losses all decrease as training proceeds.
7. Training the refine module
To train this network, run the following in a terminal; this time a trained base network must be specified:
cd $SiamMask_master/experiments/siammask_sharp
bash run.sh checkpoint_e4.pth
where run.sh contains:
if [ -z "$1" ]
then
echo "Need input base model!"
echo "Usage: bash `basename "$0"` \$BASE_MODEL"
exit
fi
# 环境变量设置
ROOT=/Users/yaoxiaoying/Documents/01-工作/03.计算机视觉/03.智慧交通/04.单目标追踪/SiamMask-master
export PYTHONPATH=$ROOT:$PYTHONPATH
# 日志设置
mkdir -p logs
base=$1
# 参数设置,运行训练程序
python -u $ROOT/tools/train_siammask_refine.py \
--config=config.json -b 8\
-j 0 --pretrained $base \
--epochs 8 \
2>&1 | tee logs/train.log
During training, the losses change as shown below:
Classification loss:
Localization loss:
Segmentation loss:
Because the mask-refinement module is attached to the segmentation branch, the loss on the segmentation branch keeps decreasing, while the classification and localization losses do not decrease noticeably.
Summary:
- Network training requires specifying the training data, the model, and the optimization method, and saving the trained model
- The siammask model is trained in two stages:
  - first the base network, which does not contain the mask-refinement module,
  - then, starting from the trained base network, the mask-refinement module itself is trained.