# Poly learning-rate schedule: at the last epoch of every p['epoch_size']-epoch
# window (epoch 9, 19, 29, ... when epoch_size == 10) recompute the lr and
# rebuild the SGD optimizer with it.
# NOTE(review): the body lines are unindented relative to the `if` — in the
# original training script they live inside the training loop; kept byte-identical here.
if epoch % p['epoch_size'] == p['epoch_size'] - 1:
# poly decay: lr = base_lr * (1 - iter/max_iter) ** power, with power = 0.9
lr_ = utils.lr_poly(base_lr=p['lr'], iter_=epoch, max_iter=nEpochs, power=0.9)
print('(poly lr policy) learning rate: ', lr_)
# a fresh SGD optimizer is created with the new lr; momentum / weight decay come from p
optimizer = optim.SGD(net.parameters(), lr=lr_, momentum=p['momentum'], weight_decay=p['wd'])
epoch从0到99,每次到9,19,29,...也就是说每10次epoch就更改学习率并将其应用到optim中,学习率的策略采用poly,其他策略可以在我的收藏里面查找
net.train()  # 进行模型的训练
inputs, labels = sample_batched['image'], sample_batched['label']  # 这里的都是tensor,6x3x512x512和6x1x512x512,因为tensor不能求梯度,所以必须转化成Variable: inputs, labels = Variable(inputs, requires_grad=True), Variable(labels),那为什么labels不需要求梯度也需要转化,原因在于labels参与了对inputs求梯度的运算,它必须也是一样的类型否则会报错.在这里对于Variable的学习可以参见官方文档
inputs, labels = inputs.cuda(), labels.cuda()  # 将数据放在gpu0号上面,到目前为止,模型和数据都放在gpu上面了
outputs = net.forward(inputs)  # 将数据传入模型里面
loss = criterion(outputs, labels, size_average=False, batch_average=True)#计算loss,采用CrossEntropyLoss函数.
这个函数的定义如下,而且关于nn.CrossEntropyLoss可见https://mp.csdn.net/postedit/83041355
from dataloaders import utils
criterion = utils.cross_entropy2d
import torch.nn as nn
def cross_entropy2d(logit, target, ignore_index=255, weight=None, size_average=True, batch_average=True):
    """2D cross-entropy loss for dense (per-pixel) classification.

    Args:
        logit: raw network scores of shape (N, C, H, W).
        target: ground-truth labels of shape (N, 1, H, W); values are class
            indices (cast to long internally), with ``ignore_index`` marking
            pixels excluded from the loss.
        ignore_index: label value to skip (default 255, the usual "void"
            label in segmentation datasets).
        weight: optional per-class weight sequence. NOTE(review): when given,
            the weight tensor is moved to the GPU, so this branch assumes
            CUDA is available — confirm against callers.
        size_average: if True, divide the summed loss by H * W.
        batch_average: if True, additionally divide by the batch size N.

    Returns:
        A scalar loss tensor.
    """
    n, c, h, w = logit.size()
    # CrossEntropyLoss expects targets of shape (N, H, W); drop the channel dim.
    target = target.squeeze(1)
    if weight is None:
        # reduction='sum' is the modern spelling of the deprecated
        # size_average=False kwarg; normalisation is done manually below.
        criterion = nn.CrossEntropyLoss(ignore_index=ignore_index, reduction='sum')
    else:
        # torch.tensor(...) replaces the original numpy round-trip
        # (torch.from_numpy(np.array(weight))), which relied on an
        # unimported `np`; behaviour is identical for list/sequence input.
        criterion = nn.CrossEntropyLoss(weight=torch.tensor(weight, dtype=torch.float).cuda(),
                                        ignore_index=ignore_index, reduction='sum')
    loss = criterion(logit, target.long())
    if size_average:
        loss /= (h * w)
    if batch_average:
        loss /= n
    return loss
下面接着主线代码
running_loss_tr += loss.item()  # .item() yields a plain python float, detached from the graph (not differentiable)
# fires on the last batch of the epoch; num_img_tr = number of training batches per epoch (1764 here)
if ii % num_img_tr == (num_img_tr - 1):
# average loss per batch over the whole epoch
running_loss_tr = running_loss_tr / num_img_tr
writer.add_scalar('data/total_loss_epoch', running_loss_tr, epoch)
# total images seen this epoch: full batches so far plus the current batch
print('[Epoch: %d, numImages: %5d]' % (epoch, ii * p['trainBatch'] + inputs.data.shape[0]))
print('Loss: %f' % running_loss_tr)
# reset the accumulator for the next epoch
running_loss_tr = 0
# wall-clock duration of this epoch, in seconds
stop_time = timeit.default_timer()
print("Execution time: " + str(stop_time - start_time) + "\n")
上面这个判断在一定的条件下从主要输出端口打印出一些的参数信息.当历经一个epoch也就是1764个batch的时候(注意上面计数是从0开始的)打印输出第几代,该代的总图片数目;平均一个batch的loss数值是多少;执行该代总共需要多少时间.同时保存数据:第几代,该代的平均一个batch的loss数值是多少.这里的时间是以秒为单位的,但是习惯上我们喜欢时分秒的查看方式,所以可以将显示部分进行修改可以参考https://www.cnblogs.com/gayhub/p/6154707.html
# Backward the averaged gradient in every batch
# Scale the loss by 1/nAveGrad so that gradients accumulated over nAveGrad
# backward passes sum to their average (with nAveGrad == 1 this is a no-op).
loss /= p['nAveGrad']
# accumulates gradients into .grad; the optimizer step happens separately below
loss.backward()
aveGrad += 1
前面得到的loss是一个可以求导的tensor,一般情况下是直接loss.backward()反向传播的,但是这里采用了Average the gradient of several iterations,只不过p['nAveGrad']=1.还有就是这个loss 是每一个batch的loss.也就是,每经过一次batch就会反向传导loss
# Update the weights once in p['nAveGrad'] forward passes
if aveGrad % p['nAveGrad'] == 0:
# log the per-iteration loss; global iteration index = batch index + batches-per-epoch * epoch
writer.add_scalar('data/total_loss_iter', loss.item(), ii + num_img_tr * epoch)
# apply the accumulated (averaged) gradients, then clear them for the next accumulation window
optimizer.step()
optimizer.zero_grad()
aveGrad = 0
上面这个也是核心代码.loss.backward()进行了求梯度运算得到梯度,optimizer.step()是进行模型参数的更新,更新完之后梯度要清空optimizer.zero_grad(),如此反复将模型进行优化.同时保存数据:第几次迭代,该迭代的loss数值是多少.注意这里的迭代不是epoch而是batch,一个batch就是迭代一次.参数p['nAveGrad']的作用是用来控制迭代多少次就更新一次模型参数