这一部分主要是添加卷积等操作后模型的训练
1 组装完整的训练循环
convnet的核心是两个循环的嵌套:外部循环是迭代周期,内部循环是DataLoader批次生成数据集。
每个循环中都包含:
通过模型提供输入(正向传播)
计算损失(正向传播的一部分)
将任何老的梯度归零
调用loss.backward()来计算损失相对所有参数的梯度(反向传播)
调用optimizer.step(),让优化器根据梯度更新模型参数
1.1 组装并训练
## Training loop
import datetime

epoch_list = []  # epoch indices, collected for plotting
loss_list = []   # per-epoch mean training loss, collected for plotting


def training_loop(n_epochs, optimizer, model, loss_fn, train_loader):
    """Standard supervised training loop.

    For each epoch, iterate over the batches of ``train_loader`` and:
    forward pass -> loss -> zero old gradients -> backward pass -> optimizer
    step. Progress is printed on epoch 1 and every 10th epoch.

    Args:
        n_epochs: number of passes over the training set.
        optimizer: optimizer wrapping ``model``'s parameters.
        model: the network being trained.
        loss_fn: criterion, e.g. ``nn.CrossEntropyLoss()``.
        train_loader: iterable of ``(imgs, labels)`` batches.
    """
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        epoch_list.append(epoch)
        for imgs, labels in train_loader:
            outputs = model(imgs)            # forward pass
            loss = loss_fn(outputs, labels)  # loss is part of the forward pass
            optimizer.zero_grad()            # clear gradients from the previous step
            loss.backward()                  # gradients of loss w.r.t. all parameters
            optimizer.step()                 # update the parameters
            loss_train += loss.item()        # .item() detaches a Python float from the graph
        # BUG FIX: record the epoch's mean loss EVERY epoch. Previously the
        # append sat under the print condition, so the "loss vs epoch" plot
        # below only received one point per 10 epochs.
        # Dividing by len(train_loader) gives the average loss per batch.
        loss_list.append(loss_train / len(train_loader))
        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(
                datetime.datetime.now(), epoch, loss_train / len(train_loader)))
加载数据并设置训练参数:
### Load the data and set up the training run
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64,shuffle=True)  # reshuffle every epoch for SGD
model = Net()
optimizer = optim.SGD(model.parameters(), lr=1e-2)  # plain SGD, learning rate 0.01
loss_fn = nn.CrossEntropyLoss()
# Train for 100 epochs using the loop defined above.
training_loop(
    n_epochs = 100,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader)
输出:
验证一下准确率:
训练集:
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64, shuffle=False)  # note: shuffle=False — order is irrelevant when evaluating
correct = 0
total = 0
acc_list = []
with torch.no_grad():  # evaluation only: no gradient tracking needed
    for imgs, labels in train_loader:
        outputs = model(imgs)
        _, predicted = torch.max(outputs, dim=1)  # index of the largest logit = predicted class
        total += labels.shape[0]
        correct += int((predicted == labels).sum())
accuracy = correct/total
acc_list.append(accuracy)  # NOTE(review): only ONE value is appended per run, so the "acc curve" below has a single point unless this cell is re-executed
print("Accuracy train: %f" % (accuracy))
plt.plot(acc_list)
plt.xlabel('epoch')
plt.ylabel('acc')
plt.title("acc curve")
# plt.ylim([0,1])
plt.show()
# Loss curve from the values recorded by training_loop.
plt.plot(loss_list)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title("loss curve")
plt.show()
输出:
验证集:
## Accuracy of the model on the validation set
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)  # shuffle=False: order does not matter for evaluation
correct1 = 0
total1 = 0
acc_list = []
with torch.no_grad():  # no gradients needed for evaluation
    for imgs, labels in val_loader:
        outputs = model(imgs)
        _, predicted = torch.max(outputs, dim=1)  # predicted class = argmax over logits
        # BUG FIX: these used to increment `total` and `correct`, the stale
        # globals left over from the train-set cell above, so the printed
        # "val" accuracy silently mixed train and validation counts.
        total1 += labels.shape[0]
        correct1 += int((predicted == labels).sum())
accuracy = correct1/total1
acc_list.append(accuracy)  # single point per run — see the note on the train-set cell
print("Accuracy val: %f" % (accuracy))
plt.plot(acc_list)
plt.title("acc curve")
plt.xlabel('epoch')
plt.ylabel('acc')
plt.ylim([0,1])
plt.show()
输出:效果很不错
1.2 保存并加载设计的模型
## Save and reload our model
data_path1 = 'D:\\DeepLearning data\\data\\p1ch8\\'  # NOTE(review): hard-coded Windows path — adjust per machine
torch.save(model.state_dict(), data_path1 + 'bird_vs_airplanes.pt')  # save only the parameters (state_dict), not the whole module
## Create a fresh instance, then load the trained parameters into it:
load_model = Net()
load_model.load_state_dict(torch.load(data_path1 + 'bird_vs_airplanes.pt'))  # NOTE(review): pass map_location= if a GPU checkpoint is loaded on CPU — confirm
查看文件夹:
保存成功了。
1.3 在GPU上训练
只需要添加两条语句加少量语句修改:
## Training on the GPU (falls back to CPU when CUDA is unavailable)
device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
print(f"Training on device {device}.")
import datetime

epoch_list = []  # epoch indices, for plotting
loss_list = []   # per-batch losses in this variant, for plotting


def training_loop(n_epochs, optimizer, model, loss_fn, train_loader):
    """GPU-aware training loop: identical to the CPU loop, but every batch is
    moved to ``device`` before the forward pass.

    Args mirror the CPU ``training_loop`` above; ``model`` must already live
    on ``device`` (see ``Net().to(device=device)`` below).
    """
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        epoch_list.append(epoch)
        for imgs, labels in train_loader:
            # BUG FIX: the batch tensors were never transferred, so training
            # on an actual GPU crashed — PyTorch operators do not accept
            # mixed GPU/CPU inputs (as the comment further below notes for
            # the model itself).
            imgs = imgs.to(device=device)
            labels = labels.to(device=device)
            outputs = model(imgs)            # forward pass
            loss = loss_fn(outputs, labels)
            optimizer.zero_grad()            # clear old gradients
            loss.backward()                  # backward pass
            optimizer.step()                 # parameter update
            loss_train += loss.item()
            loss_list.append(loss.item())    # per-batch loss (unlike the CPU loop, which records per epoch)
        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(datetime.datetime.now(), epoch, loss_train/len(train_loader)))
### Load the data and run the GPU training loop
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64,shuffle=True)
model = Net().to(device=device)  ## Don't forget to move the model (all its parameters) onto the GPU too, otherwise it errors: PyTorch operators do not accept mixed GPU/CPU inputs
optimizer = optim.SGD(model.parameters(), lr=1e-2)
loss_fn = nn.CrossEntropyLoss()
training_loop(
    n_epochs = 100,
    optimizer = optimizer,
    model = model,
    loss_fn = loss_fn,
    train_loader = train_loader)
## Check accuracy on both splits
train_loader = torch.utils.data.DataLoader(cifar2, batch_size=64, shuffle=False)  # shuffle=False: evaluation order is irrelevant
val_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=64, shuffle=False)
def validate(model, train_loader, val_loader):
    """Print and return classification accuracy on both data loaders.

    Args:
        model: trained classifier; may live on CPU or GPU.
        train_loader: loader over the training split.
        val_loader: loader over the validation split.

    Returns:
        dict mapping ``"train"`` / ``"val"`` to accuracy in [0, 1]. (The
        original returned None; callers that ignore the result are unaffected.)
    """
    # Evaluate on whichever device the model lives on, moving each batch to
    # it — mixed GPU/CPU inputs raise an error.
    device = next(model.parameters()).device
    # BUG FIX: dropped the append to the global `acc_list`, which belonged to
    # the earlier plotting cells — it polluted that list (or raised NameError
    # when those cells had not been run).
    accs = {}
    for name, loader in [("train", train_loader), ("val", val_loader)]:
        correct = 0
        total = 0
        with torch.no_grad():  # evaluation only, no gradient tracking
            for imgs, labels in loader:
                outputs = model(imgs.to(device=device))
                _, predicted = torch.max(outputs, dim=1)
                total += labels.shape[0]
                correct += int((predicted == labels.to(device=device)).sum())
        accuracy = correct / total
        accs[name] = accuracy
        print("Accuracy {} : {:.2f}" .format(name, accuracy))
    return accs
validate(model, train_loader, val_loader)
我这里还是使用的CPU训练,因此无法展示GPU的结果。
2 模型设计
2.1 增加内存容量:宽度
增加宽度也就是增加每一层的神经元个数,或每个卷积的通道数。
##模型设计
#增加内存容量:宽度(增加每层的通道数/神经元个数,而非增加层数)
class NetWidth(nn.Module):
    """Wider variant of the baseline convnet: 32 channels in the first conv,
    16 in the second, followed by a two-layer fully connected head."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 16, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(16 * 8 * 8, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        # Two conv/tanh/pool stages: 32x32 -> 16x16 -> 8x8 spatially.
        act = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        act = F.max_pool2d(torch.tanh(self.conv2(act)), 2)
        # Flatten to (batch, 16*8*8) for the classifier head.
        flat = act.view(-1, 16 * 8 * 8)
        hidden = torch.tanh(self.fc1(flat))
        return self.fc2(hidden)
实例化之后查看一下参数的数量:
# Instantiate the widened model and count its learnable parameters
# (the bare expression shows its value in a notebook cell).
model = NetWidth()
sum(p.numel() for p in model.parameters())
如果想避免在模型定义中硬编码数字,可以向__init__()传递一个参数并参数化宽度:
##避免在模型中硬编码数字(个人觉得硬编码会更直观,但也不一定,例如上一讲中随机剪裁的尺寸)
class NetWidth(nn.Module):
    """Convnet whose capacity is a constructor parameter: ``n_chans1`` sets the
    channel count of the first convolution, the second conv uses half of it."""

    def __init__(self, n_chans1=32):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans1 // 2, kernel_size=3, padding=1)
        # After two 2x2 max-poolings a 32x32 image is 8x8 spatially.
        self.fc1 = nn.Linear(8 * 8 * n_chans1 // 2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        act = F.max_pool2d(torch.tanh(self.conv1(x)), 2)
        act = F.max_pool2d(torch.tanh(self.conv2(act)), 2)
        flat = act.view(-1, 8 * 8 * self.n_chans1 // 2)
        return self.fc2(torch.tanh(self.fc1(flat)))
验证参数个数:
2.2 使用正则化
def training_loop_12reg(n_epochs, optimizer, model, loss_fn, train_loader):
    """Training loop with manual L2 (weight-decay) regularization.

    Identical to ``training_loop`` except that ``l2_lambda`` times the squared
    L2 norm of ALL parameters (weights and biases alike, as in the original)
    is added to the loss before backprop.

    NOTE(review): the name reads "12reg" (one-two) but clearly means "l2reg";
    kept unchanged so existing call sites still resolve.
    """
    l2_lambda = 0.001  # hoisted out of the loops: it is a loop-invariant constant
    for epoch in range(1, n_epochs + 1):
        loss_train = 0.0
        epoch_list.append(epoch)
        for imgs, labels in train_loader:
            outputs = model(imgs)
            loss = loss_fn(outputs, labels)
            # Squared L2 norm of every parameter tensor; this term is part of
            # the graph, so backward() also produces the decay gradients.
            l2_norm = sum(p.pow(2.0).sum() for p in model.parameters())
            loss = loss + l2_lambda * l2_norm
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_train += loss.item()       # includes the penalty term
            loss_list.append(loss.item())   # per-batch loss, as in the GPU loop
        if epoch == 1 or epoch % 10 == 0:
            print('{} Epoch {}, Training loss {}'.format(datetime.datetime.now(), epoch, loss_train/len(train_loader)))
2.3 Dropout
这一技术使得模型不依赖于单一输出,主要是将网络每轮训练迭代中的神经元随机部分清零。
class NetDropout(nn.Module):
    """NetWidth plus 2-D dropout (p=0.4) after each conv stage: whole feature
    maps are randomly zeroed during training, so the network cannot come to
    depend on any single activation."""

    def __init__(self, n_chans1=32):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv1_dropout = nn.Dropout2d(p=0.4)
        self.conv2 = nn.Conv2d(n_chans1, n_chans1 // 2, kernel_size=3, padding=1)
        self.conv2_dropout = nn.Dropout2d(p=0.4)
        self.fc1 = nn.Linear(8 * 8 * n_chans1 // 2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        # Each stage: conv -> tanh -> pool -> channel dropout.
        stage1 = self.conv1_dropout(F.max_pool2d(torch.tanh(self.conv1(x)), 2))
        stage2 = self.conv2_dropout(F.max_pool2d(torch.tanh(self.conv2(stage1)), 2))
        flat = stage2.view(-1, 8 * 8 * self.n_chans1 // 2)
        return self.fc2(torch.tanh(self.fc1(flat)))
2.4 批量归一化
批量归一化背后的主要思想是对流入激活函数的输入重新进行缩放,使每个小批量具有一定的理想分布。由于其目的是调整激活函数的输入,因此BN操作位于卷积之后、激活函数之前(与下方代码一致)。
##批量归一化 保持激活检查
'''
批量归一化背后的主要思想是对流入激活函数的输入重新进行缩放,使小批量具有一定的理想分布。
其目的是调整激活函数的输入,因此BN操作位于卷积之后、激活函数之前
'''
class NetBN(nn.Module):
    """NetWidth with batch normalization after each convolution (i.e. before
    the tanh): each conv's output is rescaled so the mini-batch feeding the
    activation has a well-behaved distribution."""

    def __init__(self, n_chans1=32):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv1_batchnorm = nn.BatchNorm2d(num_features=n_chans1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans1 // 2, kernel_size=3, padding=1)
        # BUG FIX: this was assigned to `self.conv1_batchnorm` a second time,
        # clobbering the first BN layer and leaving `conv2_batchnorm` —
        # which forward() uses — undefined (AttributeError at runtime).
        self.conv2_batchnorm = nn.BatchNorm2d(num_features=n_chans1 // 2)
        self.fc1 = nn.Linear(8 * 8 * n_chans1 // 2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = self.conv1_batchnorm(self.conv1(x))
        out = F.max_pool2d(torch.tanh(out), 2)
        # BUG FIX: the second stage applied `self.conv1` again; it must use
        # conv2 (conv1 expects 3 input channels and would fail here anyway).
        out = self.conv2_batchnorm(self.conv2(out))
        out = F.max_pool2d(torch.tanh(out), 2)
        out = out.view(-1, 8 * 8 * self.n_chans1 // 2)
        out = torch.tanh(self.fc1(out))
        out = self.fc2(out)
        return out
2.5 使用tips改进后的网络训练
###改进后的网络
class Net1(nn.Module):
    """Combined variant using all the tips: each stage runs
    conv -> dropout2d -> batchnorm -> tanh -> maxpool, followed by the usual
    two-layer fully connected head."""

    def __init__(self, n_chans1=32):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv1_dropout = nn.Dropout2d(p=0.4)
        self.conv1_batchnorm = nn.BatchNorm2d(num_features=n_chans1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans1 // 2, kernel_size=3, padding=1)
        self.conv2_dropout = nn.Dropout2d(p=0.4)
        self.conv2_batchnorm = nn.BatchNorm2d(num_features=n_chans1 // 2)
        self.fc1 = nn.Linear(8 * 8 * n_chans1 // 2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        # Stage 1: 32x32 -> 16x16; stage 2: 16x16 -> 8x8.
        first = F.max_pool2d(torch.tanh(self.conv1_batchnorm(self.conv1_dropout(self.conv1(x)))), 2)
        second = F.max_pool2d(torch.tanh(self.conv2_batchnorm(self.conv2_dropout(self.conv2(first)))), 2)
        flat = second.view(-1, 8 * 8 * self.n_chans1 // 2)
        return self.fc2(torch.tanh(self.fc1(flat)))
训练结果:
效果反倒下降了,或许是网络层数太浅?
3 更复杂的结构
3.1 跳跃连接
跳跃连接只是将输入添加到层块之间的一个输出之中:
一个更改了激活函数的NetDepth():
class NetDepth(nn.Module):
    """Three-convolution network with ReLU activations; three 2x2 poolings
    bring a 32x32 input down to 4x4 before the fully connected head."""

    def __init__(self, n_chans1=32):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans1 // 2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(n_chans1 // 2, n_chans1 // 2, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(4 * 4 * n_chans1 // 2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        feat = x
        # Each stage halves the spatial resolution: 32 -> 16 -> 8 -> 4.
        for conv in (self.conv1, self.conv2, self.conv3):
            feat = F.max_pool2d(torch.relu(conv(feat)), 2)
        flat = feat.view(-1, 4 * 4 * self.n_chans1 // 2)
        return self.fc2(torch.relu(self.fc1(flat)))
仿照ResNet向这个模型添加一个跳跃连接:
#仿照ResNet向之前的NetDepth中添加一个跳跃连接:
class NetRes(nn.Module):
    """NetDepth with one ResNet-style skip connection: the input of conv3 is
    added back to conv3's activated output before the final pooling."""

    def __init__(self, n_chans1=32):
        super().__init__()
        self.n_chans1 = n_chans1
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(n_chans1, n_chans1 // 2, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(n_chans1 // 2, n_chans1 // 2, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(4 * 4 * n_chans1 // 2, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        feat = F.max_pool2d(torch.relu(self.conv1(x)), 2)
        skip = F.max_pool2d(torch.relu(self.conv2(feat)), 2)
        # Skip connection: relu(conv3(skip)) + skip, then pool to 4x4.
        feat = F.max_pool2d(torch.relu(self.conv3(skip)) + skip, 2)
        flat = feat.view(-1, 4 * 4 * self.n_chans1 // 2)
        return self.fc2(torch.relu(self.fc1(flat)))
使用该网络训练模型:
结果:
训练集提高了一点。
3.2 使用pytorch建立非常深的网络
标准的策略是定义一个构建块 例如一个Conv2d、Relu再加跳跃连接块,然后在for循环中动态构建网络。
首先创建一个模块子类,其任务是为一个块提供计算,该块包含一组卷积、激活函数和跳跃连接:
class ResBlock(nn.Module):
    """Residual building block: 3x3 conv (no bias) -> batchnorm -> ReLU, with
    an identity skip connection, i.e. output = relu(bn(conv(x))) + x."""

    def __init__(self, n_chans):
        super().__init__()
        self.conv = nn.Conv2d(n_chans, n_chans, kernel_size=3, padding=1, bias=False)
        self.batch_norm = nn.BatchNorm2d(num_features=n_chans)
        # Kaiming-normal init (He et al., "Delving Deep into Rectifiers"):
        # weights drawn from N(0, sqrt(2 / fan_in)), which keeps activation
        # variance stable in the forward pass and gradient variance stable in
        # the backward pass for ReLU networks.
        torch.nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu')
        # Custom BN affine init: scale 0.5, shift 0 (as in the book's example).
        torch.nn.init.constant_(self.batch_norm.weight, 0.5)
        torch.nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        residual = torch.relu(self.batch_norm(self.conv(x)))
        return residual + x
生成一个由若干残差块组成的深度网络(下面的默认值 n_blocks=10,即10个块):
class NetResDeep(nn.Module):
    """Deep residual network: one conv stem followed by ``n_blocks`` ResBlocks
    (each preserves the spatial resolution), then the usual FC head."""

    def __init__(self, n_chans1=32, n_blocks=10):
        super().__init__()
        self.n_chans1 = n_chans1
        # BUG FIX: the stem was stored as `self.conv` while forward() called
        # `self.conv1`, raising AttributeError; the attribute is now `conv1`,
        # matching forward() and the naming of the other networks in this file.
        self.conv1 = nn.Conv2d(3, n_chans1, kernel_size=3, padding=1)
        # BUG FIX: `n_blocks * [ResBlock(...)]` repeated ONE block instance
        # n_blocks times, so every "block" shared the same weights; build a
        # distinct ResBlock per position instead.
        self.resblocks = nn.Sequential(*(ResBlock(n_chans=n_chans1) for _ in range(n_blocks)))
        self.fc1 = nn.Linear(8 * 8 * n_chans1, 32)
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        out = F.max_pool2d(torch.tanh(self.conv1(x)), 2)  # 32x32 -> 16x16
        out = self.resblocks(out)                          # resolution preserved
        out = F.max_pool2d(out, 2)                         # 16x16 -> 8x8
        out = out.view(-1, 8 * 8 * self.n_chans1)
        out = torch.relu(self.fc1(out))
        out = self.fc2(out)
        return out
实例化并查看:
# Instantiate the deep residual network and print its layer structure.
model = NetResDeep()
print(model)
输出:
这一部分很长了,就留到下一部分,主要是练习题和一些其他需要说明的~