Problem Discussion
As we know, nn.CrossEntropyLoss() already integrates Softmax into its loss computation: internally it applies a (log-)Softmax to the network's raw outputs before computing the loss. In other words, when nn.CrossEntropyLoss() is used to compute the loss, the network's output must not be passed through Softmax again. Take a CNN the author once built as an example; the network code is as follows:
import torch
import torch.nn as nn
from torch.optim import lr_scheduler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class hrrp_CNN(nn.Module):   # input n = 101*1201; output size per layer: (n + 2p - f)/s + 1
    def __init__(self):
        super(hrrp_CNN, self).__init__()
        self.cnn = nn.Sequential(
            # Layer 1
            nn.Conv2d(in_channels=1, out_channels=8, kernel_size=4, stride=1, padding=0),    # n = 98*1198
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),    # 49*599
            # Layer 2
            nn.Conv2d(in_channels=8, out_channels=16, kernel_size=4, stride=1, padding=0),   # 46*596
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),    # 23*298
            # Layer 3
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=0),  # 21*296
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),    # 10*148
            # Layer 4
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=0),  # 8*146
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),    # 4*73
            # Layer 5
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=0), # 2*71
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))              # 1*1
        )
        self.linear = nn.Sequential(
            nn.Linear(128 * 1 * 1, 16),
            nn.Dropout(0.2),   # nn.Dropout, not nn.Dropout2d: the input here is a 2D (batch, features) tensor
            nn.Linear(16, 4)
        )
        self.softmax = nn.Softmax(dim=1)   # kept for reference; deliberately not used in forward()

    def forward(self, x):
        x = self.cnn(x)
        x = x.view(x.shape[0], -1)   # flatten: each row of the matrix holds the features of one image in the batch
        x = self.linear(x)
        # x = self.softmax(x)        # NOT applied: nn.CrossEntropyLoss expects raw logits
        return x

model = hrrp_CNN()
model.to(device)
loss_fn = nn.CrossEntropyLoss()
loss_fn.to(device)
learning_rate = 0.005
# optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)   # the SGD line above is an alternative; a larger weight_decay penalizes the weights more strongly. Optimizer parameters are usually left at their defaults.
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.97)   # exponential learning-rate decay; don't set gamma too small
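To see concretely that nn.CrossEntropyLoss() already contains the Softmax step, here is a minimal self-contained check (batch size and class count are illustrative, not taken from the network above): cross-entropy on raw logits matches NLLLoss applied after LogSoftmax.

import torch
import torch.nn as nn

torch.manual_seed(0)
logits = torch.randn(8, 4)           # a batch of 8 samples, 4 classes (illustrative)
labels = torch.randint(0, 4, (8,))   # random integer class labels

ce = nn.CrossEntropyLoss()(logits, labels)
# CrossEntropyLoss == LogSoftmax followed by NLLLoss
nll = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), labels)
print(ce.item(), nll.item())         # the two values are identical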
With this setup, the loss computed during training and testing is correct. But a new question follows: to compute classification accuracy, we take the argmax of the network's output vector and compare it with the dataset labels; since Softmax has been removed from the network's output, the vector being compared now consists of raw logits rather than a probability distribution.
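A quick illustration of the difference (made-up values, assuming a 4-class output as in the network above): the raw output is an arbitrary real vector, while Softmax turns it into a proper probability distribution.

import torch
import torch.nn as nn

logits = torch.tensor([[2.3, -0.7, 0.1, 1.5]])   # raw network output: not a probability vector
probs = nn.Softmax(dim=1)(logits)                # approx. tensor([[0.621, 0.031, 0.069, 0.279]])
print(probs.sum())                               # 1.0 -- a proper probability distribution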
Solution
We can perform the Softmax operation outside the network: in the training and testing loops, after the loss has been computed, run the network's outputs through a separate Softmax. The full code is given below (the hrrp_CNN class is identical to the one above):
# (The hrrp_CNN class definition is identical to the one shown above and is omitted here.)
softmax = nn.Softmax(dim=1)   # Softmax is still applied separately when computing accuracy
model = hrrp_CNN()
model.to(device)
loss_fn = nn.CrossEntropyLoss()
loss_fn.to(device)
learning_rate = 0.005
# optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.0001)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)   # the SGD line above is an alternative; optimizer parameters are usually left at their defaults
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.97)   # exponential learning-rate decay; don't set gamma too small

train_acc_list = []
train_loss_list = []
val_acc_list = []
val_loss_list = []
epochs = 50
# Training + validation
for epoch in range(epochs):
    print("----- Epoch {} -----".format(epoch + 1))
    train_loss = 0.0
    val_loss = 0.0
    train_sum, train_cor, val_sum, val_cor = 0, 0, 0, 0
    val_pred = []

    # Training phase
    model.train()
    for batch_idx, data in enumerate(train):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        labels = labels.long()   # labels must be of type long for CrossEntropyLoss
        optimizer.zero_grad()
        outputs = model(inputs.float())   # .float() is needed here, otherwise an error is raised
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        # accumulate the training loss for this epoch
        train_loss += loss.item()
        # accumulate the training accuracy for this epoch
        outputs = softmax(outputs)   # apply Softmax before taking predictions
        _, predicted = torch.max(outputs, 1)   # the column index of the largest (probability) value is the predicted class
        train_cor += (predicted == labels).sum().item()   # number of correct classifications
        train_sum += labels.size(0)
    scheduler.step()
    # Note: to convert a CUDA tensor to numpy, first move it to the CPU, e.g. predicted.cpu().numpy()

    # Validation phase
    model.eval()
    with torch.no_grad():
        for batch_idx1, data in enumerate(val):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            labels = labels.long()
            outputs = model(inputs.float())
            loss = loss_fn(outputs, labels)
            # accumulate the validation loss for this epoch
            val_loss += loss.item()
            # accumulate the validation accuracy for this epoch
            outputs = softmax(outputs)
            _, predicted = torch.max(outputs, 1)
            val_cor += (predicted == labels).sum().item()
            val_sum += labels.size(0)
            val_pred = val_pred + list(predicted.cpu().numpy())

    # enumerate starts at 0, so divide by (batch_idx + 1) batches, not batch_idx
    print("Train loss:{} Train accuracy:{}% Val loss:{} Val accuracy:{}%".format(
        train_loss / (batch_idx + 1),
        100 * train_cor / train_sum,
        val_loss / (batch_idx1 + 1),
        100 * val_cor / val_sum))
    train_loss_list.append(train_loss / (batch_idx + 1))
    train_acc_list.append(100 * train_cor / train_sum)
    val_acc_list.append(100 * val_cor / val_sum)
    val_loss_list.append(val_loss / (batch_idx1 + 1))
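The four lists recorded above can then be plotted to inspect the training curves, for example as follows (a minimal sketch assuming matplotlib is installed; not part of the original training script):

import matplotlib.pyplot as plt

epochs_range = range(1, epochs + 1)
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, train_loss_list, label="train loss")
plt.plot(epochs_range, val_loss_list, label="val loss")
plt.xlabel("epoch"); plt.ylabel("loss"); plt.legend()
plt.subplot(1, 2, 2)
plt.plot(epochs_range, train_acc_list, label="train acc")
plt.plot(epochs_range, val_acc_list, label="val acc")
plt.xlabel("epoch"); plt.ylabel("accuracy (%)"); plt.legend()
plt.show()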
This way, both the loss computed by nn.CrossEntropyLoss() and the accuracy obtained from the Softmax classifier are correct. One caveat worth noting: because Softmax is a monotonic mapping, the argmax (and therefore the accuracy figure itself) is identical with or without the external Softmax; what the extra Softmax buys is output vectors that can be read as class probabilities.
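This monotonicity is easy to verify; a short sketch with random logits (purely illustrative) shows that the predicted classes, and hence the accuracy, are the same with and without the external Softmax:

import torch
import torch.nn as nn

torch.manual_seed(0)
outputs = torch.randn(32, 4)                             # a batch of raw logits (illustrative)
pred_raw = torch.argmax(outputs, 1)                      # argmax on raw logits
pred_soft = torch.argmax(nn.Softmax(dim=1)(outputs), 1)  # argmax after Softmax
print(torch.equal(pred_raw, pred_soft))                  # True: accuracy is unaffected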