1. Network model
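The snippets below assume the common CRNN implementation in which the recurrent head crnn.rnn is an nn.Sequential of two BidirectionalLSTM blocks, each exposing a .rnn (the LSTM itself) and an .embedding (the linear projection). A minimal sketch of that assumed structure (CNN backbone omitted, names nh/nclass are placeholders):

import torch.nn as nn

class BidirectionalLSTM(nn.Module):
    # assumed layout: an LSTM followed by a linear "embedding" layer
    def __init__(self, nIn, nHidden, nOut):
        super(BidirectionalLSTM, self).__init__()
        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        self.embedding = nn.Linear(nHidden * 2, nOut)

    def forward(self, input):
        recurrent, _ = self.rnn(input)                     # (T, b, 2 * nHidden)
        T, b, h = recurrent.size()
        output = self.embedding(recurrent.view(T * b, h))
        return output.view(T, b, -1)

# inside CRNN.__init__ (assumed):
# self.rnn = nn.Sequential(
#     BidirectionalLSTM(512, nh, nh),       # crnn.rnn[0]
#     BidirectionalLSTM(nh, nh, nclass))    # crnn.rnn[1]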
2. Setting different learning rates for different layers
Here we take the Adam optimizer as an example.
# set different learning rates for different layers
train_params = list(map(id, crnn.rnn.parameters()))   # ids of the RNN-head parameters
rest_params = filter(lambda p: id(p) not in train_params,
                     crnn.parameters())                # remaining parameters (the CNN backbone)
# loss averager
loss_avg = utils.averager()  # keeps a running average of the loss
# setup optimizer
if opt.adam:
    # set a different learning rate for each parameter group
    # (single-lr version for comparison:
    #  optimizer = optim.Adam(crnn.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)))
    # weight_decay could also be added here to reduce overfitting
    optimizer = optim.Adam([
        {'params': crnn.rnn[0].rnn.parameters(),       'lr': 1e-7,   'betas': (0.5, 0.999)},
        {'params': crnn.rnn[0].embedding.parameters(), 'lr': 1e-7,   'betas': (0.5, 0.999)},
        {'params': crnn.rnn[1].rnn.parameters(),       'lr': 1e-7,   'betas': (0.5, 0.999)},
        {'params': crnn.rnn[1].embedding.parameters(), 'lr': opt.lr, 'betas': (0.5, 0.999)},
        {'params': rest_params,                        'lr': opt.lr, 'betas': (0.5, 0.999)},
    ])
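Note that optimizer.param_groups keeps the groups in the order they were passed in, so index 3 is the crnn.rnn[1].embedding group; section 3 relies on this ordering. A quick inspection snippet (not part of the original training script):

for i, group in enumerate(optimizer.param_groups):
    n_params = sum(p.numel() for p in group['params'])
    print('group %d: lr=%g, %d parameters' % (i, group['lr'], n_params))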
3. Updating the learning rate during training
With the setup above we only train the RNN part of the network, and the learning rate is decayed only for the last layer (param group 3, the crnn.rnn[1].embedding group).
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 5 epochs."""
    lr = opt.lr * (0.1 ** (epoch // 5))
    # for param_group in optimizer.param_groups:  # this would decay every group's lr
    optimizer.param_groups[3]['lr'] = lr  # only decay the last embedding layer (group 3)

for epoch in range(opt.nepoch):
    # adjust the learning rate every 5 epochs (only the last fully connected layer is changed)
    adjust_learning_rate(optimizer, epoch)
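An equivalent way to express this per-group schedule, assuming the five parameter groups defined above, is torch.optim.lr_scheduler.LambdaLR, which takes one multiplicative-factor function per group; only group 3 (the last embedding layer) is decayed. A sketch, not the original code:

from torch.optim.lr_scheduler import LambdaLR

keep  = lambda epoch: 1.0                   # factor for groups whose lr stays fixed
decay = lambda epoch: 0.1 ** (epoch // 5)   # decay by 10x every 5 epochs

scheduler = LambdaLR(optimizer, lr_lambda=[keep, keep, keep, decay, keep])

for epoch in range(opt.nepoch):
    # ... train for one epoch ...
    scheduler.step()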
4. Freezing the CNN layers
for name, param in crnn.named_parameters():
    if 'rnn' in name:                 # train the RNN layers (e.g. rnn.1.embedding)
        param.requires_grad = True
    else:
        param.requires_grad = False   # freeze the CNN backbone
crnn.train()
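With the CNN backbone frozen this way, it is also common to hand only the trainable parameters to the optimizer so the frozen ones are not tracked at all; a small sketch (not from the original script):

trainable = [p for p in crnn.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable, lr=opt.lr, betas=(0.5, 0.999))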