上图是论文的网络结构图,包括5个卷积层和3个全连接层. 作者还特别强调了depth的重要性: 少一层结果就会变差,所以这种超参数的调节可真是不简单.
激活函数
首先讨论的是激活函数,作者选择的不是\(f(x)=\tanh(x)\)或者\(f(x)=(1+e^{-x})^{-1}\)这类会饱和的函数,而是ReLUs (Rectified Linear Units)——\(f(x)=\max (0, x)\). 当然,作者考虑的问题是比赛的那个数据集,其网络的收敛速度为:
接下来,作者讨论了标准化的问题,说ReLUs是不需要进行这一步的,论文中的那句话我感觉理解的怪怪的:
ReLUs have the desirable property that they do not require input normalization to prevent them from saturating.
饱和?
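作者说,也可以对ReLUs进行扩展,使得其更有泛化性,把多个核进行标准化处理(局部响应归一化):
\[b_{x,y}^i = a_{x,y}^i \Big/ \Big(k + \alpha \sum_{j=\max(0,\, i-n/2)}^{\min(N-1,\, i+n/2)} \big(a_{x,y}^j\big)^2\Big)^{\beta},\]
其中\(N\)是该层核的总数,\(k, n, \alpha, \beta\)是超参数.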
\(i\)表示核的顺序,\(a_{x,y}^i\)则是其值, 说实话,这部分也没怎么弄懂.
然后是关于池化层的部分,一般的池化层的核是不重叠的,而作者采用了重叠的池化,即步长小于核的大小(论文中核大小为3,步长为2).
防止过拟合
为了防止过拟合,作者提出了他的几点经验.
增加数据
这个数据不是简单的多找点数据,而是通过一些变换使得数据增加.
比如对图片进行平移(随机裁剪)和水平翻转,以及对RGB像素值做PCA提主成分,沿主成分方向改变各通道的强度等.
Dropout
综合多个模型的预测是防止过拟合的好方法,但是训练多个网络的代价太高. Dropout即让隐层的神经元以一定的概率输出为0,所以每一次训练,网络的结构实际上都是不一样的,但是整个网络是共享参数的,所以可以一次性训练多个模型?
细节
batch size: 128
momentum: 0.9
weight decay: 0.0005
一般的随机梯度下降好像是没有weight decay这一部分的,但是作者说,实验中这个的选择还是蛮有效的.
代码
很遗憾的是,没有成功,我不知道是哪里出了问题, loss一直没有下降,如果有大佬能找到病因,务必告诉我.
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
# Folder holding the local CIFAR-10 copy; downloading is disabled below, so
# the dataset files are expected to already be present there.
root = "C:/Users/pkavs/1jupiterdata/data"

# Both splits share the same preprocessing: Resize(227), tensor conversion,
# then per-channel normalization with mean 0.5 and std 0.5.
trainset = torchvision.datasets.CIFAR10(
    root=root,
    train=True,
    download=False,
    transform=transforms.Compose([
        transforms.Resize(227),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]),
)
testset = torchvision.datasets.CIFAR10(
    root=root,
    train=False,
    download=False,
    transform=transforms.Compose([
        transforms.Resize(227),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]),
)

# Mini-batches of 64, loaded in the main process (num_workers=0).
# Only the training batches are shuffled.
train_loader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=64,
    shuffle=True,
    num_workers=0,
)
test_loader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
)
# 创建网络 .......................
class Net(nn.Module):
    """AlexNet-style CNN: five convolution stages and a three-layer classifier.

    Expects input of shape (N, 3, 227, 227) and returns (N, 10) class scores
    (no softmax is applied).

    NOTE(review): this is not the two-tower layout of the paper. Both
    48-channel halves of the conv1 output pass through the *same* conv2
    weights, and after the merge a single conv3 -> conv4 -> conv5 path feeds
    both halves of the classifier input. Giving each half its own weights
    would change the parameter shapes and invalidate saved checkpoints, so
    the layout is kept as it is.
    """

    def __init__(self):
        super().__init__()
        # 3 x 227 x 227 --> 96 x 55 x 55
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 96, 11, 4, 0),  # 11 x 11 kernel, stride 4, no padding
            nn.ReLU()
        )
        # Applied to each 48-channel half: 48 x 55 x 55 --> 128 x 27 x 27
        self.conv2 = nn.Sequential(
            nn.Conv2d(48, 128, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(3, 2)
        )
        # 256 x 27 x 27 --> 192 x 13 x 13
        self.conv3 = nn.Sequential(
            nn.Conv2d(256, 192, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(3, 2)
        )
        # 192 x 13 x 13 --> 192 x 13 x 13
        self.conv4 = nn.Sequential(
            nn.Conv2d(192, 192, 3, 1, 1),
            nn.ReLU()
        )
        # 192 x 13 x 13 --> 128 x 6 x 6
        self.conv5 = nn.Sequential(
            nn.Conv2d(192, 128, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(3, 2)
        )
        # Classifier input: 2 * 128 * 6 * 6 = 9216 features per sample.
        self.dense = nn.Sequential(
            nn.Linear(9216, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(4096, 10)
        )

    def forward(self, input):
        """Compute class scores for a batch of images.

        Args:
            input: image batch of shape (N, 3, 227, 227). The parameter name
                is kept for compatibility with keyword callers.

        Returns:
            Tensor of shape (N, 10).
        """
        x = self.conv1(input)
        # Split the 96 channels into two halves that share the conv2 weights.
        x1, x2 = x[:, :48, :, :], x[:, 48:, :, :]
        x1 = self.conv2(x1)
        x2 = self.conv2(x2)
        x = torch.cat((x1, x2), 1)
        # Both branches would receive the same tensor and go through the same
        # layers, so the result is computed once and duplicated rather than
        # evaluating conv3/conv4/conv5 twice.
        x = self.conv5(self.conv4(self.conv3(x)))
        x = torch.cat((x, x), 1)
        # Flatten per sample. A mis-sized input then fails in the first Linear
        # layer instead of being silently reshaped into another batch size.
        x = x.view(x.size(0), -1)
        return self.dense(x)


# The training loop below is wrapped in a bare string literal, so it is
# disabled and does not run when this script executes.
# NOTE(review): it uses Adam with lr = 0.01. That step size is presumably too
# large for this network and is a plausible reason the loss never decreased;
# try a smaller value (e.g. 1e-3 or lower) and confirm.
"""
print("Start Training...")
net = Net()
criterion = nn.CrossEntropyLoss() #损失函数
lr = 0.01 #学习率
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
for epoch in range(5):
running_loss = 0.0
for i, data in enumerate(train_loader, 0):
inputs, labels = data
outputs = net(inputs)
loss = criterion(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item() #loss.item()转化为标量
if i % 10 == 9:
print("[{epoch}, {i:>5d}] loss: {loss:>.3f}".format(
epoch=epoch,
i=i + 1,
loss=running_loss / 10
))
running_loss = 0.0
torch.save(net.state_dict(), root + "/alexnet.pt")
print('Finished Training')
"""
# Rebuild the network and restore the trained weights saved by the training
# loop. map_location keeps the load working on machines without a GPU.
net = Net()
net.load_state_dict(torch.load(root + "/alexnet.pt", map_location="cpu"))
net.eval()  # evaluation mode: the Dropout layers are switched off

correct = 0  # number of test images whose predicted class matches the label
total = 0    # number of test images evaluated
with torch.no_grad():  # no gradients are needed for evaluation
    try:
        for images, labels in test_loader:
            outputs = net(images)
            # The index of the largest score is the predicted class.
            _, pred = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (pred == labels).sum().item()
    except FileNotFoundError as err:
        # Best effort: keep whatever was counted so far, but say why the
        # evaluation stopped instead of hiding the error.
        print("Evaluation stopped early, a dataset file is missing: %s" % err)
    finally:
        print(total)

if total:
    print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))
else:
    # Nothing was evaluated, so the ratio above would divide by zero.
    print("No test images were evaluated; accuracy is undefined.")
调整了网络后可以下降了:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
# Folder holding the local CIFAR-10 copy; downloading is disabled below, so
# the dataset files are expected to already be present there.
root = "C:/Users/pkavs/1jupiterdata/data"

# Both splits share the same preprocessing: tensor conversion followed by
# per-channel normalization with mean 0.5 and std 0.5. No resizing is done.
trainset = torchvision.datasets.CIFAR10(
    root=root,
    train=True,
    download=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]),
)
testset = torchvision.datasets.CIFAR10(
    root=root,
    train=False,
    download=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]),
)

# Mini-batches of 64, loaded in the main process (num_workers=0).
# Only the training batches are shuffled.
train_loader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=64,
    shuffle=True,
    num_workers=0,
)
test_loader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
)
# 创建网络 .......................
class Net(nn.Module):
    """Scaled-down AlexNet-style CNN for 32 x 32 inputs with 10 classes.

    Expects input of shape (N, 3, 32, 32) and returns (N, 10) class scores
    (no softmax is applied).

    NOTE(review): this is not the two-tower layout of the paper. Both
    48-channel halves of the conv1 output pass through the *same* conv2
    weights, and after the merge a single conv3 -> conv4 -> conv5 path feeds
    both halves of the classifier input. Giving each half its own weights
    would change the parameter shapes and invalidate saved checkpoints, so
    the layout is kept as it is.
    """

    def __init__(self):
        super().__init__()
        # 3 x 32 x 32 --> 96 x 8 x 8 (conv) --> 96 x 4 x 4 (pool)
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 96, 7, 4, 2),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        # Applied to each 48-channel half:
        # 48 x 4 x 4 --> 128 x 4 x 4 (conv) --> 128 x 2 x 2 (pool)
        self.conv2 = nn.Sequential(
            nn.Conv2d(48, 128, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        # 256 x 2 x 2 --> 192 x 2 x 2
        self.conv3 = nn.Sequential(
            nn.Conv2d(256, 192, 3, 1, 1),
            nn.ReLU()
        )
        # 192 x 2 x 2 --> 192 x 2 x 2
        self.conv4 = nn.Sequential(
            nn.Conv2d(192, 192, 3, 1, 1),
            nn.ReLU()
        )
        # 192 x 2 x 2 --> 128 x 2 x 2 (conv) --> 128 x 1 x 1 (pool)
        self.conv5 = nn.Sequential(
            nn.Conv2d(192, 128, 3, 1, 1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        # Classifier input: 2 * 128 * 1 * 1 = 256 features per sample.
        self.dense = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(128, 10)
        )

    def forward(self, input):
        """Compute class scores for a batch of images.

        Args:
            input: image batch of shape (N, 3, 32, 32). The parameter name is
                kept for compatibility with keyword callers.

        Returns:
            Tensor of shape (N, 10).
        """
        x = self.conv1(input)
        # Split the 96 channels into two halves that share the conv2 weights.
        x1, x2 = x[:, :48, :, :], x[:, 48:, :, :]
        x1 = self.conv2(x1)
        x2 = self.conv2(x2)
        x = torch.cat((x1, x2), 1)
        # Both branches would receive the same tensor and go through the same
        # layers, so the result is computed once and duplicated rather than
        # evaluating conv3/conv4/conv5 twice.
        x = self.conv5(self.conv4(self.conv3(x)))
        x = torch.cat((x, x), 1)
        # Flatten per sample. A mis-sized input then fails in the first Linear
        # layer instead of being silently reshaped into another batch size.
        x = x.view(x.size(0), -1)
        return self.dense(x)


# The training loop below is wrapped in a bare string literal, so it is
# disabled and does not run when this script executes.
# NOTE(review): every learning-rate change in it constructs a new SGD
# optimizer, and a new optimizer starts without the momentum accumulated so
# far. Setting the lr on the existing optimizer's param_groups would keep that
# state; confirm which behavior is intended.
"""
print("Start Training...")
net = Net()
criterion = nn.CrossEntropyLoss() #损失函数
lr = 0.14 #学习率
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=0.0005)
for epoch in range(7):
if epoch == 3:
lr = 0.01
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=0.0005)
running_loss = 0.0
for i, data in enumerate(train_loader, 0):
inputs, labels = data
optimizer.zero_grad()
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % 10 == 9:
print("[{epoch}, {i:>5d}] loss: {loss:>.3f}".format(
epoch=epoch,
i=i + 1,
loss=running_loss / 10
))
running_loss = 0.0
torch.save(net.state_dict(), root + "/alexnet.pt")
if i % 100 == 99 and lr > 0.1:
lr = lr / 5
optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=0.0005)
print('Finished Training')
"""
# Rebuild the network and restore the trained weights saved by the training
# loop. map_location keeps the load working on machines without a GPU.
net = Net()
net.load_state_dict(torch.load(root + "/alexnet.pt", map_location="cpu"))
net.eval()  # evaluation mode: the Dropout layer is switched off

correct = 0  # number of test images whose predicted class matches the label
total = 0    # number of test images evaluated
with torch.no_grad():  # no gradients are needed for evaluation
    try:
        for images, labels in test_loader:
            outputs = net(images)
            # The index of the largest score is the predicted class.
            _, pred = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (pred == labels).sum().item()
    except FileNotFoundError as err:
        # Best effort: keep whatever was counted so far, but say why the
        # evaluation stopped instead of hiding the error.
        print("Evaluation stopped early, a dataset file is missing: %s" % err)
    finally:
        print(total)

if total:
    print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))
else:
    # Nothing was evaluated, so the ratio above would divide by zero.
    print("No test images were evaluated; accuracy is undefined.")
一方面之前4096-->10可能是有点问题,另一方面,学习率很重要,一开始学习率低于0.1或者高于0.16损失都不会下降, 最后损失降为0.9左右,正确率为0.63, 调学习率应该会有更好的性能.