Because the network is unstable at the very start of training, simply fixing the learning rate from the first step causes problems, so a warm-up plus cosine-annealing schedule is used: the learning rate is ramped up linearly to 0.001 over the first 10 epochs, then decayed to near 0 along a cosine curve over the remaining 90 epochs.
The code is as follows:
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter

# net, optimizer, loss_function, train_loader, train_num, device and
# initial_lr (0.001) are assumed to be defined earlier in the script.
scheduler = CosineAnnealingLR(optimizer, T_max=90)  # cosine annealing: decay the lr along a cosine curve
epochs = 100
warmup_epochs = 10
best_acc = 0.0
writer = SummaryWriter(log_dir='logs')
init_img = torch.zeros([1, 4, 224, 224, 128], device=device)
writer.add_graph(net.to(device), init_img)  # log the model graph to TensorBoard
globalstep = 0
for epoch in range(epochs):
    # train
    net.train()  # training mode enables dropout (and batch-norm updates)
    running_loss = 0.0
    train_acc = 0.0
    if globalstep < warmup_epochs:
        # linear warm-up: ramp the lr from initial_lr / 10 up to initial_lr
        lr_scale = min(1., float(globalstep + 1) / warmup_epochs)
        optimizer.param_groups[0]['lr'] = initial_lr * lr_scale
    else:
        # advance the cosine schedule; PyTorch computes the new lr from the
        # base lr recorded when the scheduler was constructed
        scheduler.step()
    globalstep += 1
    for step, data in enumerate(train_loader):
        images, _, _, train_labels, _ = data
        if torch.isnan(torch.mean(images)):
            # torch.isnan() tests element-wise for NaN; if the batch mean is
            # NaN, report the batch and skip it
            print(str(step) + 'th batch is nan')
            continue
        optimizer.zero_grad()  # gradients accumulate by default, so clear them every step
        train_outputs = net(images.to(device))  # forward pass
        print("output:{}".format(train_outputs.item()))  # these debug prints assume batch_size == 1
        print("label:{}".format(train_labels.item()))
        loss = loss_function(train_outputs, train_labels.to(device))
        loss.backward()
        optimizer.step()  # update all parameters
        print("loss:{}".format(loss.item()))  # .item() extracts a Python scalar from the tensor
        # print statistics
        running_loss += loss.item()  # without .item() the sum would keep the computation graph alive
        train_predict_y = (train_outputs > 0.5).type(torch.float32)  # threshold the sigmoid output for binary labels
        train_acc += torch.eq(train_predict_y, train_labels.to(device)).sum().item()
        # print("train_acc: {:.3f}".format(train_acc))
    train_accurate = train_acc / train_num
    # print("lr:{}".format(optimizer.param_groups[0]['lr']))
The resulting learning-rate curve is shown in the figure below.
Training likewise produced good results.
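Since the figure itself is not reproduced here, the curve can be regenerated with a dry run of the schedule above. A minimal sketch assuming initial_lr = 0.001 as in the text; matplotlib is used only to trace lr against epoch:

import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR
import matplotlib.pyplot as plt

initial_lr = 0.001
param = torch.nn.Parameter(torch.zeros(1))  # dummy parameter, no real training
optimizer = SGD([param], lr=initial_lr)
scheduler = CosineAnnealingLR(optimizer, T_max=90)

lrs = []
for globalstep in range(100):
    if globalstep < 10:  # linear warm-up, exactly as in the training loop
        lr_scale = min(1., float(globalstep + 1) / 10)
        optimizer.param_groups[0]['lr'] = initial_lr * lr_scale
    else:
        scheduler.step()  # cosine decay over the remaining 90 epochs
    lrs.append(optimizer.param_groups[0]['lr'])  # record the lr used this epoch

plt.plot(range(100), lrs)
plt.xlabel('epoch')
plt.ylabel('learning rate')
plt.savefig('lr_curve.png')

The recorded values rise linearly from initial_lr / 10 to initial_lr over the first 10 epochs and then follow the cosine curve down to 0 at epoch 100.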