现代卷积神经网络
深度卷积神经网络(AlexNet)
试着增加迭代轮数。对比LeNet的结果有什么不同?为什么?
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
# Use the first CUDA device throughout.
device = torch.device('cuda:0')

# Shared preprocessing: to-tensor, then upscale 28x28 -> 224x224 so the
# Fashion-MNIST images match AlexNet's expected input resolution.
_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
])

train_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=True, download=True, transform=_transform)
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=32, shuffle=True, num_workers=0)

test_set = torchvision.datasets.FashionMNIST(
    root='./dataMnist', train=False, download=True, transform=_transform)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=32, shuffle=True, num_workers=0)
# AlexNet adapted to 1-channel 224x224 input (Fashion-MNIST, upscaled).
net = nn.Sequential(
    # Stage 1: 1x224x224 -> 96x54x54, pool -> 96x26x26
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Stage 2: -> 256x26x26, pool -> 256x12x12
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Stage 3: three 3x3 convs at 384 channels, pool -> 384x5x5
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Classifier head: flatten 384*5*5 = 9600 features, two dropout-regularized
    # hidden layers, 10 output logits (CrossEntropyLoss applies softmax).
    nn.Flatten(),
    nn.Linear(9600, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10),
)
# x = torch.zeros((1, 3, 224, 224))
# for layer in net:
# x = layer(x)
# print(layer.__class__.__name__, "\t ouput_size:", x.shape)
# #交叉熵损失函数中传递未规范化的预测,并同时计算softmax及其对数
# #参数的初始化
def init_weights(m):
    """Weight-init hook for net.apply: draw fully connected weights from
    N(0, 0.01^2); all other layers keep PyTorch's default initialization."""
    if type(m) != nn.Linear:
        return
    nn.init.normal_(m.weight, std=0.01)


net.apply(init_weights)
# Plain SGD; CrossEntropyLoss takes raw logits and applies log-softmax itself.
net = net.to(device)  # move the model to the GPU once, not inside the batch loop
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []  # average training loss per epoch, for the plot below
for i in range(epoch):
    net.train()  # Dropout must be active during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() yields a detached Python float; accumulating the tensor
        # itself would keep every batch's autograd graph alive (memory leak).
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # loss_sum is a sum of per-batch means, so normalize by the batch count.
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # disable Dropout for evaluation (the original skipped this)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
'''
epoch: 0 loss= 1712.237548828125
epoch: 1 loss= 659.957275390625
epoch: 2 loss= 551.9481201171875
epoch: 3 loss= 484.9080810546875
epoch: 4 loss= 433.572265625
epoch: 5 loss= 393.9735107421875
epoch: 6 loss= 361.9253234863281
epoch: 7 loss= 331.5981140136719
epoch: 8 loss= 302.447265625
epoch: 9 loss= 276.08563232421875
测试集准确度 tensor(0.9043, device='cuda:0')
'''
比lenet效果更好,alexnet学习能力更强。
AlexNet对于Fashion-MNIST数据集来说可能太复杂了。尝试简化模型以加快训练速度,同时确保准确性不会显著下降。设计一个更好的模型,可以直接在 28×28 图像上工作。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
device = torch.device('cuda:0')
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=32,
shuffle=True,
num_workers = 0
)
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=32,
shuffle=True,
num_workers = 0
)
# Simplified AlexNet for the exercise: two of the three 3x3 conv layers and
# one of the two hidden FC layers are removed, keeping the same interface.
net = nn.Sequential(
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),   # 54x54 -> 26x26
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),   # 26x26 -> 12x12
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),   # 12x12 -> 5x5
    nn.Flatten(),                            # 384*5*5 = 9600 features
    nn.Linear(9600, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10),
)
# x = torch.zeros((1, 3, 224, 224))
# for layer in net:
# x = layer(x)
# print(layer.__class__.__name__, "\t ouput_size:", x.shape)
# #交叉熵损失函数中传递未规范化的预测,并同时计算softmax及其对数
# #参数的初始化
def init_weights(m):
    """Weight-init hook for net.apply: N(0, 0.01^2) on nn.Linear weights."""
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)


net.apply(init_weights)
net = net.to(device)  # move the model to the GPU once, not per batch
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []  # average training loss per epoch
for i in range(epoch):
    net.train()  # Dropout active during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches; summing the tensor would retain every batch's
        # autograd graph and leak GPU memory across the epoch.
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # Sum of per-batch means -> normalize by number of batches.
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # Dropout off for evaluation (missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 980.5542602539062
epoch: 1 loss= 567.556396484375
epoch: 2 loss= 486.2161865234375
epoch: 3 loss= 434.8039855957031
epoch: 4 loss= 394.9348449707031
epoch: 5 loss= 358.361083984375
epoch: 6 loss= 324.60540771484375
epoch: 7 loss= 295.7160339355469
epoch: 8 loss= 268.5332336425781
epoch: 9 loss= 247.34283447265625
测试集准确度 0.9116999506950378
修改批量大小,并观察模型精度和GPU显存变化。
随着batchsize增大,GPU所使用的内存也增多,占用率随着增大。
在AlexNet中主要是哪部分占用显存?
图片数据和中间结果?
在AlexNet中主要是哪部分需要更多的计算?
卷积层
vgg
我们只看到8个结果,而不是11个结果。剩余的3层信息去哪了?
与AlexNet相比,VGG的计算要慢得多,而且它还需要更多的显存。分析出现这种情况的原因。
VGG更深,卷积层更多,卷积是一件很贵的事情。
尝试将Fashion-MNIST数据集图像的高度和宽度从224改为96。这对实验有什么影响?
那么VGG的网络不能直接使用,需要据此修改一下,调整一下尺寸变化的过程。
请参考VGG论文 :cite:Simonyan.Zisserman.2014中的表1构建其他常见模型,如VGG-16或VGG-19。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
device = torch.device('cuda:0')
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=64,
shuffle=True,
num_workers = 0
)
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=64,
shuffle=True,
num_workers = 0
)
# VGG building block
def vgg_block(num_convs, in_channels, out_channels):
    """One VGG stage: `num_convs` (3x3 conv + ReLU) pairs, then a 2x2 max pool.

    The first conv maps in_channels -> out_channels; the remaining convs keep
    out_channels. The pool halves the spatial resolution.
    """
    layers = []
    channels = in_channels
    for _ in range(num_convs):
        layers += [nn.Conv2d(channels, out_channels, kernel_size=3, padding=1),
                   nn.ReLU()]
        channels = out_channels
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)
# Per-stage (num_convs, out_channels) configurations; channel widths are a
# quarter of the paper's to keep training on Fashion-MNIST affordable.
con_arc_11 = [(1, 64 // 4), (1, 128 // 4), (2, 256 // 4), (2, 512 // 4), (2, 512 // 4)]  # VGG-11
con_arc_16 = [(2, 64 // 4), (2, 128 // 4), (3, 256 // 4), (3, 512 // 4), (3, 512 // 4)]  # VGG-16
con_arc_19 = [(2, 64 // 4), (2, 128 // 4), (4, 256 // 4), (4, 512 // 4), (4, 512 // 4)]  # VGG-19
def vgg(con_arc, in_channels):
    """Assemble a VGG net: stacked vgg_blocks per `con_arc`, then an FC head.

    con_arc: list of (num_convs, out_channels) per stage.
    The flattened feature size 6272 assumes a 224x224 input and the
    quarter-width configs above (128 channels * 7 * 7 after five pools).
    """
    stages = []
    channels = in_channels
    for num_convs, out_channels in con_arc:
        stages.append(vgg_block(num_convs, channels, out_channels))
        channels = out_channels
    head = [
        nn.Flatten(),
        nn.Linear(6272, 4096), nn.ReLU(), nn.Dropout(p=0.5),
        nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
        nn.Linear(4096, 10),
    ]
    return nn.Sequential(*stages, *head)
net = vgg(con_arc_16, 1)
x = torch.zeros((1, 1, 224, 224))
for layer in net:
x = layer(x)
print(layer.__class__.__name__, "\t输出的格式为: ", x.shape)
print("vgg16的结构为:", net)
# Training loop (VGG-16 variant). CrossEntropyLoss consumes raw logits.
net = net.to(device)  # move the model to the GPU once, not per batch
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []  # average training loss per epoch
for i in range(epoch):
    net.train()  # Dropout active during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches; accumulating the tensor would retain every
        # batch's autograd graph and leak GPU memory.
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # Sum of per-batch means -> normalize by number of batches.
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # Dropout off for evaluation (missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
'''
epoch: 0 loss= 2160.221923828125
epoch: 1 loss= 790.5088500976562
epoch: 2 loss= 310.9482116699219
epoch: 3 loss= 251.4301300048828
epoch: 4 loss= 213.0938262939453
epoch: 5 loss= 185.2352294921875
epoch: 6 loss= 162.23760986328125
epoch: 7 loss= 138.5640869140625
epoch: 8 loss= 118.95980072021484
epoch: 9 loss= 102.10594177246094
测试集准确度 0.9230999946594238
'''
nin
调整NiN的超参数,以提高分类准确性。
依旧使用SGD,学习率调为0.12, epoch 15
epoch: 0 loss= 949.5301513671875
epoch: 1 loss= 541.0523071289062
epoch: 2 loss= 335.288818359375
epoch: 3 loss= 265.7752990722656
epoch: 4 loss= 236.0061492919922
epoch: 5 loss= 214.9103546142578
epoch: 6 loss= 200.35089111328125
epoch: 7 loss= 186.38710021972656
epoch: 8 loss= 173.8882293701172
epoch: 9 loss= 164.55499267578125
epoch: 10 loss= 157.58424377441406
epoch: 11 loss= 150.84255981445312
epoch: 12 loss= 145.72850036621094
epoch: 13 loss= 139.71893310546875
epoch: 14 loss= 134.76446533203125
测试集准确度 0.8855999708175659
为什么NiN块中有两个 1×1 卷积层?删除其中一个,然后观察和分析实验现象。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
device = torch.device('cuda:0')
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
# NiN building block
def Nin_block(in_channels, out_channels, padding, stride, kernel_size):
    """One NiN block: a spatial conv followed by two 1x1 convs.

    The 1x1 convs act as a per-pixel MLP, mixing information across channels
    without changing the spatial resolution.
    """
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels,
                  kernel_size=kernel_size, stride=stride, padding=padding),
        nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU(),
        nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU(),
    )
def Nin():
    """NiN for 1x224x224 input: four NiN blocks with interleaved max pools,
    then global average pooling over 10 class channels — no FC layers."""
    return nn.Sequential(
        Nin_block(1, 96, stride=4, kernel_size=11, padding=0),   # -> 96x54x54
        nn.MaxPool2d(kernel_size=3, stride=2),                   # -> 96x26x26
        Nin_block(96, 256, stride=1, kernel_size=5, padding=2),
        nn.MaxPool2d(kernel_size=3, stride=2),                   # -> 256x12x12
        Nin_block(256, 384, stride=1, kernel_size=3, padding=1),
        nn.MaxPool2d(kernel_size=3, stride=2),                   # -> 384x5x5
        nn.Dropout(0.5),
        Nin_block(384, 10, stride=1, kernel_size=3, padding=1),  # one map per class
        nn.AdaptiveAvgPool2d((1, 1)),  # global average pool: one score per class
        nn.Flatten(),                  # (N, 10, 1, 1) -> (N, 10)
    )
net = Nin()
# x = torch.zeros((1, 1, 224, 224))
# for layer in net:
# x = layer(x)
# print(layer.__class__.__name__, "\t输出的格式为: ", x.shape)
def init_weights(layer):
    """Xavier-uniform init for conv and FC weights.

    Initialization matters here: with PyTorch's defaults this NiN barely
    trains at all.
    """
    if type(layer) in (nn.Linear, nn.Conv2d):
        nn.init.xavier_uniform_(layer.weight)


net.apply(init_weights)
print("Nin的结构为:", net)
# Training loop (NiN). CrossEntropyLoss consumes raw logits.
net = net.to(device)  # move the model to the GPU once, not per batch
optimizer = optim.SGD(net.parameters(), lr=0.1)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []  # average training loss per epoch
for i in range(epoch):
    net.train()  # Dropout active during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches; accumulating the tensor would retain every
        # batch's autograd graph and leak GPU memory.
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # Sum of per-batch means -> normalize by number of batches.
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # Dropout off for evaluation (missing in the original)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 993.8709716796875
epoch: 1 loss= 533.2142944335938
epoch: 2 loss= 361.1051330566406
epoch: 3 loss= 277.5993347167969
epoch: 4 loss= 232.45095825195312
epoch: 5 loss= 205.37686157226562
epoch: 6 loss= 187.60452270507812
epoch: 7 loss= 176.19105529785156
epoch: 8 loss= 165.3572540283203
epoch: 9 loss= 157.9745330810547
测试集准确度 0.8700000047683716
俩个可以多次融合通道信息
换成一个:
Nin的结构为: Sequential(
(0): Sequential(
(0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
(1): ReLU()
(2): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(2): Sequential(
(0): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(1): ReLU()
(2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(4): Sequential(
(0): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU()
(2): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(6): Dropout(p=0.5, inplace=False)
(7): Sequential(
(0): Conv2d(384, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU()
(2): Conv2d(10, 10, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(8): AdaptiveAvgPool2d(output_size=(1, 1))
(9): Flatten(start_dim=1, end_dim=-1)
)
epoch: 0 loss= 847.5736694335938
epoch: 1 loss= 356.9827575683594
epoch: 2 loss= 261.1847839355469
epoch: 3 loss= 224.17762756347656
epoch: 4 loss= 196.07040405273438
epoch: 5 loss= 180.1049041748047
epoch: 6 loss= 171.22512817382812
epoch: 7 loss= 160.30470275878906
epoch: 8 loss= 153.3402099609375
epoch: 9 loss= 147.3529052734375
测试集准确度 0.8526999950408936
换成一个效果略低,两个可以增加模型的非线性表达能力。
计算NiN的资源使用情况。参数的数量是多少?计算量是多少?
参数数量:96×(1×11×11+1) + 2×(1×1×96×96) + 256×(96×5×5+1) + 2×(1×1×256×256) + 384×(256×3×3+1) + 2×(1×1×384×384) + 10×(384×3×3+1) + 2×(1×1×10×10) = 1995284
计算量:(54×54×96×11×11) + 2×(54×54×96×96×96) + (26×26×256×5×5) + 2×(26×26×256×256×256) + (12×12×384×3×3) + (12×12×384×384×384) + (10×384×5×5×3×3) + 2×(10×5×5×10×10) ≈ 2.469×10^10
一次性直接将 384×5×5 的表示缩减为 10×5×5 的表示,会存在哪些问题?
信息损失过大?
googlenet
训练结果
epoch: 0 loss= 1633.828857421875
epoch: 1 loss= 803.0449829101562
epoch: 2 loss= 605.2086181640625
epoch: 3 loss= 472.7867126464844
epoch: 4 loss= 409.02154541015625
epoch: 5 loss= 368.7477722167969
epoch: 6 loss= 339.0345458984375
epoch: 7 loss= 315.32861328125
epoch: 8 loss= 297.1580810546875
epoch: 9 loss= 287.3929748535156
测试集准确度 0.880899965763092
使用GoogLeNet的最小图像大小是多少?
只有前面的block1缩小了图片的大小,则最小图片大小为5×5:经过7×7卷积(stride 2)变为3×3,再经过3×3的池化变为1×1。
将AlexNet、VGG和NiN的模型参数大小与GoogLeNet进行比较。后两个网络架构是如何显著减少模型参数大小的?
NiN是采用1×1卷积替换全连接,VGG则是用多个小卷积核替代大的卷积核。(GoogLeNet也采用了1×1卷积先降低通道维数、减少计算参数的方式)
batch-norm
在使用批量规范化之前,我们是否可以从全连接层或卷积层中删除偏置参数?为什么?
我认为是可以的,偏置最终也会被作为平均值中的一部分然后被减掉。
比较LeNet在使用和不使用批量规范化情况下的学习率
使用 :1.0 准确度 85.7%
不使用: 0.3 73.3%
我们是否需要在每个层中进行批量规范化?尝试一下?
只留了最后俩个batch-norm
lr:0.3 51.3% 测试集准确度大起大落不稳定
只留了前俩个batch-norm
lr:0.3 80.9% 测试集准确度较为稳定,说明相较于加在后面还是应该加在前面,可以确保数值稳定性。
你可以通过批量规范化来替换暂退法吗?行为会如何改变?
# LeNet variant for the exercise: Dropout after the second conv stands in
# for batch normalization; everything else matches classic LeNet on 28x28.
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5), nn.Dropout(0.5), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),                     # 16*4*4 = 256 features at 28x28 input
    nn.Linear(256, 120), nn.Sigmoid(),
    nn.Linear(120, 84), nn.Sigmoid(),
    nn.Linear(84, 10),
)
lr:0.3 测试准确度 60.6%
resnet
numref:fig_inception中的Inception块与残差块之间的主要区别是什么?在删除了Inception块中的一些路径之后,它们是如何相互关联的?
个人所见,尽管俩个网络的构建思路不同,Inception是为了使用不同的卷积核去提取不同的特征(同时使用少的参数),但是Resnet模块是为了保证更深的网络表达的函数囊括浅层的函数,使得更深的网络有意义。但是在架构上,可以将Resnet看作是一种特殊的Inception模块。
参考ResNet论文 :cite:He.Zhang.Ren.ea.2016中的表1,以实现不同的变体。
贴一下Resnet 18的训练结果
epoch: 0 loss= 274.33294677734375
epoch: 1 loss= 123.11670684814453
epoch: 2 loss= 98.39700317382812
epoch: 3 loss= 81.01783752441406
epoch: 4 loss= 65.5454330444336
epoch: 5 loss= 51.666439056396484
epoch: 6 loss= 37.66799545288086
epoch: 7 loss= 27.368988037109375
epoch: 8 loss= 18.34890365600586
epoch: 9 loss= 10.552614212036133
epoch: 10 loss= 9.15225601196289
epoch: 11 loss= 5.566713809967041
epoch: 12 loss= 2.6198325157165527
epoch: 13 loss= 0.49261292815208435
epoch: 14 loss= 0.2144828885793686
测试集准确度 0.9327999949455261
实现一下Resnet-34:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
device = torch.device('cuda:0')
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
# ResNet basic block
class ResBlock(nn.Module):
    """Two 3x3 convs with batch norm plus a skip connection.

    The first block of every stage except b2 halves the spatial size
    (stride 2) while changing channel width, so its skip path needs a
    strided 1x1 projection; all other blocks use the identity skip.
    """

    def __init__(self, in_channels, out_channels, b2=False, first_block=True):
        super().__init__()
        downsample = first_block and not b2
        stride = 2 if downsample else 1
        # 1x1 projection so the skip matches the main path's shape.
        self.conv_one = (nn.Conv2d(in_channels, out_channels, kernel_size=1,
                                   stride=stride, padding=0)
                         if downsample else None)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        skip = self.conv_one(x) if self.conv_one is not None else x
        return F.relu(y + skip)
def ResBlocks(nums, b2, in_channels, out_channels):
    """Stack `nums` ResBlocks into one stage.

    Only the first block may change channels / downsample; the remaining
    blocks map out_channels -> out_channels with identity skips.
    """
    stage = [ResBlock(in_channels, out_channels, b2, first_block=True)]
    stage += [ResBlock(out_channels, out_channels, b2, first_block=False)
              for _ in range(nums - 1)]
    return nn.Sequential(*stage)
# ResNet stem: 7x7/2 conv + BN + ReLU + 3x3/2 max pool (224 -> 56 spatial).
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# Four residual stages with [3, 4, 6, 3] blocks — the ResNet-34 layout.
# b2 keeps 64 channels with no downsampling; each later stage doubles the
# channel count and halves the spatial size in its first block.
b2 = ResBlocks(3, True, 64, 64)
b3 = ResBlocks(4, False, 64, 128)
b4 = ResBlocks(6, False, 128, 256)
b5 = ResBlocks(3, False, 256, 512)
# Global average pool to 1x1, then a single linear classifier over 10 classes.
net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), nn.Flatten(), nn.Linear(512, 10))
x = torch.zeros((1, 1, 224, 224))
for layer in net:
x = layer(x)
print(layer.__class__.__name__, "\t输出的形状为:", x.shape)
def init_weights(layer):
if type(layer)== nn.Linear or type(layer) == nn.Conv2d:
nn.init.xavier_uniform_(layer.weight) #初始化很重要,NiN随机初始化训练不动。。。
net.apply(init_weights)
print("Resnet18的结构为:", net)
# Training loop (ResNet-34). CrossEntropyLoss consumes raw logits.
net = net.to(device)  # move the model to the GPU once, not per batch
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []  # average training loss per epoch
for i in range(epoch):
    net.train()  # BatchNorm uses batch statistics during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches; accumulating the tensor would retain every
        # batch's autograd graph and leak GPU memory.
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # Sum of per-batch means -> normalize by number of batches.
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # BatchNorm must use running stats at test time (was missing)
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += (y_hat.argmax(dim=1) == y).sum().item()
print("测试集准确度", acc / len(test_set))
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 379.17254638671875
epoch: 1 loss= 152.3948211669922
epoch: 2 loss= 121.77942657470703
epoch: 3 loss= 103.70003509521484
epoch: 4 loss= 89.28633117675781
epoch: 5 loss= 75.78622436523438
epoch: 6 loss= 62.75402069091797
epoch: 7 loss= 53.15045928955078
epoch: 8 loss= 42.1845703125
epoch: 9 loss= 34.198307037353516
测试集准确度 0.9125999808311462
对于更深层次的网络,ResNet引入了“bottleneck”架构来降低模型复杂性。请你试着去实现它。
x = torch.zeros((1, 1, 224, 224))
class bottleneck(nn.Module):
    """ResNet bottleneck: 1x1 reduce -> 3x3 -> 1x1 expand, plus a skip path.

    c_num = [in, mid, out] channel widths. When conv_skip is True the skip
    is a (possibly strided) 1x1 projection; otherwise it is the identity,
    which requires c_num[0] == c_num[2].

    NOTE(review): ReLU is applied before the addition here, whereas the
    ResNet paper adds first and then applies ReLU — preserved as written.
    """

    def __init__(self, c_num, conv_skip=True, stride=1):
        super().__init__()
        c_in, c_mid, c_out = c_num
        self.conv_layer = nn.Sequential(
            nn.Conv2d(c_in, c_mid, kernel_size=1, padding=0, stride=1),
            nn.BatchNorm2d(c_mid),
            nn.ReLU(),
            nn.Conv2d(c_mid, c_mid, kernel_size=3, padding=1, stride=stride),
            nn.BatchNorm2d(c_mid),
            nn.ReLU(),
            nn.Conv2d(c_mid, c_out, kernel_size=1, padding=0, stride=1),
            nn.BatchNorm2d(c_out),
            nn.ReLU())
        self.conv_skip = (nn.Conv2d(c_in, c_out, kernel_size=1, padding=0,
                                    stride=stride)
                          if conv_skip else None)

    def forward(self, x):
        y = self.conv_layer(x)
        if self.conv_skip is not None:
            return y + self.conv_skip(x)
        return y + x
def bottle_block(block_num, c_num, b2=False):
    """Stack `block_num` bottlenecks into one stage.

    The first block (unless b2=True) projects c_num[0] -> c_num[2] with
    stride 2; every other block is an identity-skip bottleneck at c_num[2]
    width. NOTE(review): with b2=True the first block also takes the
    identity path, which assumes the stage input already has c_num[2]
    channels — never exercised by the callers below.
    """
    stage = []
    for idx in range(block_num):
        if idx == 0 and not b2:
            stage.append(bottleneck(c_num, True, stride=2))
        else:
            stage.append(bottleneck([c_num[2], c_num[1], c_num[2]], False))
    return nn.Sequential(*stage)
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b2 = bottle_block(3, [64, 64, 256])
b3 = bottle_block(4, [256, 128, 512])
b4 = bottle_block(6, [512, 256, 1024])
b5 = bottle_block(3, [1024, 512, 2048])
net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), nn.Flatten(), nn.Linear(2048, 10))
for layer in net:
x = layer(x)
print(layer.__class__.__name__, "\t输出的形状为:", x.shape)
Sequential 输出的形状为: torch.Size([1, 64, 56, 56])
Sequential 输出的形状为: torch.Size([1, 256, 28, 28])
Sequential 输出的形状为: torch.Size([1, 512, 14, 14])
Sequential 输出的形状为: torch.Size([1, 1024, 7, 7])
Sequential 输出的形状为: torch.Size([1, 2048, 4, 4])
AdaptiveAvgPool2d 输出的形状为: torch.Size([1, 2048, 1, 1])
Flatten 输出的形状为: torch.Size([1, 2048])
Linear 输出的形状为: torch.Size([1, 10])
为什么即使函数类是嵌套的,我们仍然要限制增加函数的复杂性呢?
- 如无必要,勿增实体,拟合能力过强的函数会导致过拟合。
- 复杂的函数会消耗大量的计算资源
- 可解释性会变差
Densenet
先训练一下Densenet
epoch: 0 loss= 574.0405883789062
epoch: 1 loss= 304.017578125
epoch: 2 loss= 242.1274871826172
epoch: 3 loss= 208.28538513183594
epoch: 4 loss= 185.4270782470703
epoch: 5 loss= 170.85366821289062
epoch: 6 loss= 156.8195343017578
epoch: 7 loss= 142.7688446044922
epoch: 8 loss= 132.4578857421875
epoch: 9 loss= 120.2202377319336
epoch: 10 loss= 110.72298431396484
epoch: 11 loss= 103.44509887695312
epoch: 12 loss= 94.1830825805664
epoch: 13 loss= 88.94744110107422
epoch: 14 loss= 80.26952362060547
测试集准确度 0.877299964427948
为什么我们在过渡层使用平均汇聚层而不是最大汇聚层?
将平均汇聚改为最大汇聚:
def conv_dense(in_channels, out_channels):
    """DenseNet composite layer: BN-ReLU-1x1 conv (bottleneck down to 128
    channels), then BN-ReLU-3x3 conv producing `out_channels` (the growth
    rate). Spatial resolution is preserved."""
    return nn.Sequential(
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, 128, kernel_size=1, padding=0, stride=1),
        nn.BatchNorm2d(128),
        nn.ReLU(),
        nn.Conv2d(128, out_channels, kernel_size=3, stride=1, padding=1))
class DesBlock(nn.Module):
    """Dense block: `nums` conv_dense layers whose outputs are concatenated.

    Layer i receives in_channels + i*out_channels input channels and
    contributes out_channels (the growth rate), so the block's output has
    in_channels + nums*out_channels channels.
    """

    def __init__(self, nums, in_channels, out_channels):
        super().__init__()
        # Register the layers through nn.Sequential so their parameters are
        # picked up by .to(device) and the optimizer. (The original also kept
        # a redundant plain-list attribute and a no-op `self.blocks =
        # self.blocks` self-assignment — both removed.)
        self.net = nn.Sequential(
            *[conv_dense(in_channels + i * out_channels, out_channels)
              for i in range(nums)])

    def forward(self, x):
        # Concatenate each layer's output onto its input along the channel dim.
        for block in self.net:
            y = block(x)
            x = torch.cat((x, y), dim=1)
        return x
def transition(in_channels):
    """Transition layer between dense blocks: halve the channel count with a
    1x1 conv, then halve the spatial size. This variant pools with max
    pooling instead of the usual average pooling (for the exercise's
    comparison)."""
    out_channels = in_channels // 2
    return nn.Sequential(
        nn.BatchNorm2d(in_channels), nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0, stride=1),
        nn.MaxPool2d(kernel_size=2, stride=2),
    )
epoch: 0 loss= 515.3784790039062
epoch: 1 loss= 282.50244140625
epoch: 2 loss= 232.67477416992188
epoch: 3 loss= 202.9937286376953
epoch: 4 loss= 178.63426208496094
epoch: 5 loss= 161.4530029296875
epoch: 6 loss= 147.14990234375
epoch: 7 loss= 129.9204864501953
epoch: 8 loss= 117.783203125
epoch: 9 loss= 106.5365219116211
epoch: 10 loss= 94.76472473144531
epoch: 11 loss= 87.81078338623047
epoch: 12 loss= 78.73601531982422
epoch: 13 loss= 69.39637756347656
epoch: 14 loss= 62.99326705932617
测试集准确度 0.9085999727249146
准确度更高。
DenseNet的优点之一是其模型参数比ResNet小。为什么呢?
重复使用了特征图,使得无需重复学习的参数。
DenseNet一个诟病的问题是内存或显存消耗过多。真的是这样吗?可以把输入形状换成 224×224 ,来看看实际的显存消耗。
是这样的,可能因为要保存许多的特征图,6G显存不太够,需要降低batchsize。
现代卷积神经网络
深度卷积神经网络(AlexNet)
试着增加迭代轮数。对比LeNet的结果有什么不同?为什么?
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
device = torch.device('cuda:0')
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=32,
shuffle=True,
num_workers = 0
)
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=32,
shuffle=True,
num_workers = 0
)
#Flatten拉平
net = nn.Sequential(nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2), #54*54 --26
nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),#26*26
nn.MaxPool2d(kernel_size=3, stride=2),#12*12
nn.Conv2d(256, 384,kernel_size=3, padding=1), nn.ReLU(),
nn.Conv2d(384, 384,kernel_size=3, padding=1), nn.ReLU(),
nn.Conv2d(384, 384,kernel_size=3, padding=1), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Flatten(),
nn.Linear(9600, 4096), nn.ReLU(),nn.Dropout(p=0.5),
nn.Linear(4096, 4096), nn.ReLU(),nn.Dropout(p=0.5),
nn.Linear(4096, 10))
# x = torch.zeros((1, 3, 224, 224))
# for layer in net:
# x = layer(x)
# print(layer.__class__.__name__, "\t ouput_size:", x.shape)
# #交叉熵损失函数中传递未规范化的预测,并同时计算softmax及其对数
# #参数的初始化
def init_weights(m):
if(type(m))==nn.Linear:
nn.init.normal_(m.weight, std=0.01)
net.apply(init_weights)
optimizer = optim.SGD(net.parameters(), lr = 0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []
for i in range(epoch):
loss_sum = 0
for x, y in train_loader:
net = net.to(device)
x = x.to(device)
y = y.to(device)
y_hat = net(x)
loss_temp = loss(y_hat, y)
loss_sum += loss_temp
optimizer.zero_grad()
loss_temp.backward()
optimizer.step()
losses.append(loss_sum.cpu().detach().numpy()/train_set.data.shape[0])
print("epoch: ",i, "loss=", loss_sum.item())
acc = 0
with torch.no_grad():
for x, y in test_loader:
x = x.to(device)
y = y.to(device)
y_hat = net(x)
acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度",(acc/test_set.data.shape[0]).item())
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
'''
epoch: 0 loss= 1712.237548828125
epoch: 1 loss= 659.957275390625
epoch: 2 loss= 551.9481201171875
epoch: 3 loss= 484.9080810546875
epoch: 4 loss= 433.572265625
epoch: 5 loss= 393.9735107421875
epoch: 6 loss= 361.9253234863281
epoch: 7 loss= 331.5981140136719
epoch: 8 loss= 302.447265625
epoch: 9 loss= 276.08563232421875
测试集准确度 tensor(0.9043, device='cuda:0')
'''
比lenet效果更好,alexnet学习能力更强。
AlexNet对于Fashion-MNIST数据集来说可能太复杂了。尝试简化模型以加快训练速度,同时确保准确性不会显著下降。设计一个更好的模型,可以直接在 28×28 图像上工作。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
device = torch.device('cuda:0')
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=32,
shuffle=True,
num_workers = 0
)
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=32,
shuffle=True,
num_workers = 0
)
#Flatten拉平
net = nn.Sequential(nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2), #54*54 --26
nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),#26*26
nn.MaxPool2d(kernel_size=3, stride=2),#12*12
nn.Conv2d(256, 384,kernel_size=3, padding=1), nn.ReLU(),
# nn.Conv2d(384, 384,kernel_size=3, padding=1), nn.ReLU(),
# nn.Conv2d(384, 384,kernel_size=3, padding=1), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Flatten(),
nn.Linear(9600, 4096), nn.ReLU(),nn.Dropout(p=0.5),
# nn.Linear(4096, 4096), nn.ReLU(),nn.Dropout(p=0.5),
nn.Linear(4096, 10))
# x = torch.zeros((1, 3, 224, 224))
# for layer in net:
# x = layer(x)
# print(layer.__class__.__name__, "\t ouput_size:", x.shape)
# #交叉熵损失函数中传递未规范化的预测,并同时计算softmax及其对数
# #参数的初始化
def init_weights(m):
if(type(m))==nn.Linear:
nn.init.normal_(m.weight, std=0.01)
net.apply(init_weights)
optimizer = optim.SGD(net.parameters(), lr = 0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []
for i in range(epoch):
loss_sum = 0
for x, y in train_loader:
net = net.to(device)
x = x.to(device)
y = y.to(device)
y_hat = net(x)
loss_temp = loss(y_hat, y)
loss_sum += loss_temp
optimizer.zero_grad()
loss_temp.backward()
optimizer.step()
losses.append(loss_sum.cpu().detach().numpy()/train_set.data.shape[0])
print("epoch: ",i, "loss=", loss_sum.item())
acc = 0
with torch.no_grad():
for x, y in test_loader:
x = x.to(device)
y = y.to(device)
y_hat = net(x)
acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度",(acc/test_set.data.shape[0]).item())
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 980.5542602539062
epoch: 1 loss= 567.556396484375
epoch: 2 loss= 486.2161865234375
epoch: 3 loss= 434.8039855957031
epoch: 4 loss= 394.9348449707031
epoch: 5 loss= 358.361083984375
epoch: 6 loss= 324.60540771484375
epoch: 7 loss= 295.7160339355469
epoch: 8 loss= 268.5332336425781
epoch: 9 loss= 247.34283447265625
测试集准确度 0.9116999506950378
修改批量大小,并观察模型精度和GPU显存变化。
随着batchsize增大,GPU所使用的内存也增多,占用率随着增大。
在AlexNet中主要是哪部分占用显存?
图片数据和中间结果?
在AlexNet中主要是哪部分需要更多的计算?
卷积层
vgg
我们只看到8个结果,而不是11个结果。剩余的3层信息去哪了?
与AlexNet相比,VGG的计算要慢得多,而且它还需要更多的显存。分析出现这种情况的原因。
VGG更深,卷积层更多,卷积是一件很贵的事情。
尝试将Fashion-MNIST数据集图像的高度和宽度从224改为96。这对实验有什么影响?
那么VGG的网络不能直接使用,需要据此修改一下,调整一下尺寸变化的过程。
请参考VGG论文 :cite:Simonyan.Zisserman.2014中的表1构建其他常见模型,如VGG-16或VGG-19。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
# NOTE(review): hard-codes the first CUDA device — this script fails without a GPU
device = torch.device('cuda:0')
# Fashion-MNIST train split, upscaled from 28x28 to 224x224 to match the VGG input size
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=64,
shuffle=True,
num_workers = 0
)
# test split with the identical transform pipeline
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=64,
shuffle=True,
num_workers = 0
)
#创建VGG块
def vgg_block(num_convs, in_channels, out_channels):
    """Return one VGG block: `num_convs` 3x3 conv+ReLU pairs followed by a 2x2 max-pool."""
    layers = []
    channels = in_channels
    for _ in range(num_convs):
        layers += [nn.Conv2d(channels, out_channels, kernel_size=3, padding=1), nn.ReLU()]
        channels = out_channels  # every conv after the first keeps the channel count
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)
# (num_convs, out_channels) per stage; all channel counts are scaled down 4x
# to fit Fashion-MNIST training into limited GPU memory
con_arc_11 = [(1, int(64//4)), (1, 128//4), (2, 256//4), (2, 512//4), (2, 512//4)] #vgg11
con_arc_16 = [(2, 64//4), (2, 128//4), (3, 256//4), (3, 512//4), (3, 512//4)] #vgg16
con_arc_19 = [(2, 64//4), (2, 128//4), (4, 256//4), (4, 512//4), (4, 512//4)] #vgg19
def vgg(con_arc, in_channels):
    """Build a VGG network from the (num_convs, out_channels) stage list `con_arc`.

    Assumes a 224x224 input halved by each of the 5 stages (224 / 2**5 = 7),
    so the classifier sees last_stage_channels * 7 * 7 features.
    """
    blocks = []
    for num_convs, out_channels in con_arc:
        blocks.append(vgg_block(num_convs, in_channels, out_channels))
        in_channels = out_channels
    # was hard-coded 6272 (= 128 * 7 * 7, only valid for the //4-scaled configs);
    # derive it from the last stage so other channel scales work unchanged
    feat = con_arc[-1][1] * 7 * 7
    return nn.Sequential(*blocks, nn.Flatten(),
                         nn.Linear(feat, 4096), nn.ReLU(), nn.Dropout(p=0.5),
                         nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
                         nn.Linear(4096, 10))
net = vgg(con_arc_16, 1)

# probe per-layer output shapes with a dummy input
x = torch.zeros((1, 1, 224, 224))
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__, "\t输出的格式为: ", x.shape)
print("vgg16的结构为:", net)

net = net.to(device)  # move the model once, instead of on every mini-batch
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []
for i in range(epoch):
    net.train()  # keep Dropout active during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches the scalar; accumulating the tensor retained
        # every batch's autograd graph
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # average the per-batch mean losses over batches, not over samples
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # disable Dropout for evaluation
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度", (acc / len(test_set)).item())
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
'''
epoch: 0 loss= 2160.221923828125
epoch: 1 loss= 790.5088500976562
epoch: 2 loss= 310.9482116699219
epoch: 3 loss= 251.4301300048828
epoch: 4 loss= 213.0938262939453
epoch: 5 loss= 185.2352294921875
epoch: 6 loss= 162.23760986328125
epoch: 7 loss= 138.5640869140625
epoch: 8 loss= 118.95980072021484
epoch: 9 loss= 102.10594177246094
测试集准确度 0.9230999946594238
'''
nin
调整NiN的超参数,以提高分类准确性。
依旧使用SGD,学习率调为0.12, epoch 15
epoch: 0 loss= 949.5301513671875
epoch: 1 loss= 541.0523071289062
epoch: 2 loss= 335.288818359375
epoch: 3 loss= 265.7752990722656
epoch: 4 loss= 236.0061492919922
epoch: 5 loss= 214.9103546142578
epoch: 6 loss= 200.35089111328125
epoch: 7 loss= 186.38710021972656
epoch: 8 loss= 173.8882293701172
epoch: 9 loss= 164.55499267578125
epoch: 10 loss= 157.58424377441406
epoch: 11 loss= 150.84255981445312
epoch: 12 loss= 145.72850036621094
epoch: 13 loss= 139.71893310546875
epoch: 14 loss= 134.76446533203125
测试集准确度 0.8855999708175659
为什么NiN块中有两个 1×1 卷积层?删除其中一个,然后观察和分析实验现象。
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
# NOTE(review): hard-codes the first CUDA device — this script fails without a GPU
device = torch.device('cuda:0')
# Fashion-MNIST train split, upscaled from 28x28 to 224x224 to match the NiN input size
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
# test split with the identical transform pipeline
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
#创建Nin块
def Nin_block(in_channels, out_channels, padding, stride, kernel_size):
    """One NiN block: a spatial conv followed by two 1x1 convs acting as a per-pixel MLP."""
    layers = [
        nn.Conv2d(in_channels, out_channels, padding=padding, stride=stride, kernel_size=kernel_size),
        nn.ReLU(),
    ]
    for _ in range(2):  # the two 1x1 "network in network" layers
        layers += [nn.Conv2d(out_channels, out_channels, kernel_size=1), nn.ReLU()]
    return nn.Sequential(*layers)
def Nin():
    """NiN for Fashion-MNIST: four NiN blocks separated by 3x3/stride-2 max-pools,
    ending with global average pooling over the 10 class channels."""
    return nn.Sequential(
        Nin_block(1, 96, stride=4, kernel_size=11, padding=0),
        nn.MaxPool2d(kernel_size=3, stride=2),
        Nin_block(96, 256, stride=1, kernel_size=5, padding=2),
        nn.MaxPool2d(kernel_size=3, stride=2),
        Nin_block(256, 384, stride=1, kernel_size=3, padding=1),
        nn.MaxPool2d(kernel_size=3, stride=2),
        nn.Dropout(0.5),
        # label layer: one output channel per class
        Nin_block(384, 10, stride=1, kernel_size=3, padding=1),
        nn.AdaptiveAvgPool2d((1, 1)),
        nn.Flatten(),  # (N, 10, 1, 1) -> (N, 10)
    )
net = Nin()

# x = torch.zeros((1, 1, 224, 224))
# for layer in net:
#     x = layer(x)
#     print(layer.__class__.__name__, "\t输出的格式为: ", x.shape)

def init_weights(layer):
    """Xavier-initialize conv/linear weights; NiN barely moves with the default init."""
    if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
        nn.init.xavier_uniform_(layer.weight)

net.apply(init_weights)
print("Nin的结构为:", net)
net = net.to(device)  # move the model once, instead of on every mini-batch
optimizer = optim.SGD(net.parameters(), lr=0.1)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []
for i in range(epoch):
    net.train()  # keep Dropout active during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches the scalar; accumulating the tensor retained
        # every batch's autograd graph
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # average the per-batch mean losses over batches, not over samples
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # disable Dropout for evaluation
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度", (acc / len(test_set)).item())
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 993.8709716796875
epoch: 1 loss= 533.2142944335938
epoch: 2 loss= 361.1051330566406
epoch: 3 loss= 277.5993347167969
epoch: 4 loss= 232.45095825195312
epoch: 5 loss= 205.37686157226562
epoch: 6 loss= 187.60452270507812
epoch: 7 loss= 176.19105529785156
epoch: 8 loss= 165.3572540283203
epoch: 9 loss= 157.9745330810547
测试集准确度 0.8700000047683716
两个 1×1 卷积可以多次融合通道信息。
换成一个:
Nin的结构为: Sequential(
(0): Sequential(
(0): Conv2d(1, 96, kernel_size=(11, 11), stride=(4, 4))
(1): ReLU()
(2): Conv2d(96, 96, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(2): Sequential(
(0): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
(1): ReLU()
(2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(4): Sequential(
(0): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU()
(2): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
(6): Dropout(p=0.5, inplace=False)
(7): Sequential(
(0): Conv2d(384, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU()
(2): Conv2d(10, 10, kernel_size=(1, 1), stride=(1, 1))
(3): ReLU()
)
(8): AdaptiveAvgPool2d(output_size=(1, 1))
(9): Flatten(start_dim=1, end_dim=-1)
)
epoch: 0 loss= 847.5736694335938
epoch: 1 loss= 356.9827575683594
epoch: 2 loss= 261.1847839355469
epoch: 3 loss= 224.17762756347656
epoch: 4 loss= 196.07040405273438
epoch: 5 loss= 180.1049041748047
epoch: 6 loss= 171.22512817382812
epoch: 7 loss= 160.30470275878906
epoch: 8 loss= 153.3402099609375
epoch: 9 loss= 147.3529052734375
测试集准确度 0.8526999950408936
换成一个效果略低,两个 1×1 卷积可以增加模型的非线性表达能力。
计算NiN的资源使用情况。参数的数量是多少?计算量是多少?
参数量:96×(1×11×11+1) + (1×1×96×96)×2 + 256×(96×5×5+1) + (1×1×256×256)×2 + 384×(256×3×3+1) + (1×1×384×384)×2 + 10×(384×3×3+1) + 2×(1×1×10×10) = 1995284
计算量:(54×54×96×11×11) + (54×54×96×96×96)×2 + (26×26×256×5×5) + (26×26×256×256×256)×2 + (12×12×384×3×3) + (12×12×384×384×384) + (10×384×5×5×3×3) + (10×5×5×10×10)×2 ≈ 2.469×10^10
一次性直接将 384×5×5 的表示缩减为 10×5×5 的表示,会存在哪些问题?
通道数从 384 骤减到 10,一次性压缩过猛会造成信息损失过大,也缺少逐步提取、融合特征的过程,训练会更不稳定。
googlenet
训练结果
epoch: 0 loss= 1633.828857421875
epoch: 1 loss= 803.0449829101562
epoch: 2 loss= 605.2086181640625
epoch: 3 loss= 472.7867126464844
epoch: 4 loss= 409.02154541015625
epoch: 5 loss= 368.7477722167969
epoch: 6 loss= 339.0345458984375
epoch: 7 loss= 315.32861328125
epoch: 8 loss= 297.1580810546875
epoch: 9 loss= 287.3929748535156
测试集准确度 0.880899965763092
使用GoogLeNet的最小图像大小是多少?
只有前面的 block1 缩小了图片的大小,则最小图片大小为 5×5:经过 7×7 卷积变 3×3,再经过 3×3 的池化变 1×1。
将AlexNet、VGG和NiN的模型参数大小与GoogLeNet进行比较。后两个网络架构是如何显著减少模型参数大小的?
NiN 是采用 1×1 卷积替换全连接,VGG 则是用多个小卷积核替代大的卷积核。(GoogLeNet 也采用了 1×1 卷积先降低通道维数、减少计算参数的方式)
batch-norm
在使用批量规范化之前,我们是否可以从全连接层或卷积层中删除偏置参数?为什么?
我认为是可以的,偏置最终也会在批量规范化减去均值的操作中被抵消掉。
比较LeNet在使用和不使用批量规范化情况下的学习率
使用 :1.0 准确度 85.7%
不使用: 0.3 73.3%
我们是否需要在每个层中进行批量规范化?尝试一下?
只留了最后俩个batch-norm
lr:0.3 51.3% 测试集准确度大起大落不稳定
只留了前俩个batch-norm
lr:0.3 80.9% 测试集准确度较为稳定,说明相较于加在后面还是应该加在前面,可以确保数值稳定性。
你可以通过批量规范化来替换暂退法吗?行为会如何改变?
# LeNet with a Dropout layer inserted after the second conv — the baseline used
# for the "replace dropout with batch normalization" comparison (no BN layers here)
net = nn.Sequential(
nn.Conv2d(1, 6, kernel_size=5), nn.Sigmoid(),
nn.AvgPool2d(kernel_size=2, stride=2),
nn.Conv2d(6, 16, kernel_size=5), nn.Dropout(0.5), nn.Sigmoid(),
nn.AvgPool2d(kernel_size=2, stride=2), nn.Flatten(),
nn.Linear(256, 120), nn.Sigmoid(),
nn.Linear(120, 84), nn.Sigmoid(),
nn.Linear(84, 10))
lr:0.3 测试准确度 60.6%
resnet
numref:fig_inception中的Inception块与残差块之间的主要区别是什么?在删除了Inception块中的一些路径之后,它们是如何相互关联的?
个人所见,尽管俩个网络的构建思路不同,Inception是为了使用不同的卷积核去提取不同的特征(同时使用少的参数),但是Resnet模块是为了保证更深的网络表达的函数囊括浅层的函数,使得更深的网络有意义。但是在架构上,可以将Resnet看作是一种特殊的Inception模块。
参考ResNet论文 :cite:He.Zhang.Ren.ea.2016中的表1,以实现不同的变体。
贴一下Resnet 18的训练结果
epoch: 0 loss= 274.33294677734375
epoch: 1 loss= 123.11670684814453
epoch: 2 loss= 98.39700317382812
epoch: 3 loss= 81.01783752441406
epoch: 4 loss= 65.5454330444336
epoch: 5 loss= 51.666439056396484
epoch: 6 loss= 37.66799545288086
epoch: 7 loss= 27.368988037109375
epoch: 8 loss= 18.34890365600586
epoch: 9 loss= 10.552614212036133
epoch: 10 loss= 9.15225601196289
epoch: 11 loss= 5.566713809967041
epoch: 12 loss= 2.6198325157165527
epoch: 13 loss= 0.49261292815208435
epoch: 14 loss= 0.2144828885793686
测试集准确度 0.9327999949455261
实现一下Resnet-34:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
# NOTE(review): hard-codes the first CUDA device — this script fails without a GPU
device = torch.device('cuda:0')
# Fashion-MNIST train split, upscaled from 28x28 to 224x224 to match the ResNet input size
train_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=True
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
train_loader = torch.utils.data.DataLoader(
train_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
# test split with the identical transform pipeline
test_set = torchvision.datasets.FashionMNIST(
root='./dataMnist'
,train=False
,download=True
,transform=transforms.Compose([
transforms.ToTensor(),
transforms.Resize((224, 224))
])
)
test_loader = torch.utils.data.DataLoader(
test_set,
batch_size=128,
shuffle=True,
num_workers = 0
)
#创建Resnet_block
class ResBlock(nn.Module):
    """Basic two-conv residual block (ResNet-18/34 style).

    When it is the first block of a stage (and not the stage right after the
    stem, flagged by `b2`), it downsamples with stride 2 and uses a 1x1 conv
    on the skip path to match the channel count.
    """
    def __init__(self, in_channels, out_channels, b2=False, first_block=True):
        super().__init__()
        if first_block and not b2:
            stride = 2
            # projection shortcut: match channels and spatial size
            self.conv_one = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0)
        else:
            stride = 1
            self.conv_one = None  # identity shortcut
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))
        shortcut = self.conv_one(x) if self.conv_one is not None else x
        return F.relu(y + shortcut)
def ResBlocks(nums, b2, in_channels, out_channels):
    """Stack `nums` basic residual blocks into one stage.

    Only the first block may change the channel count / downsample; the rest
    keep `out_channels` and stride 1.
    """
    stage = [
        ResBlock(in_channels, out_channels, b2, first_block=True) if i == 0
        else ResBlock(out_channels, out_channels, b2, first_block=False)
        for i in range(nums)
    ]
    return nn.Sequential(*stage)
# Stem: 7x7 conv (stride 2) + 3x3 max-pool (stride 2) — a 224x224 input becomes 56x56
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# ResNet-34 stage layout (3/4/6/3 basic blocks); b2 keeps 56x56, each later stage halves the size
b2 = ResBlocks(3, True, 64, 64)
b3 = ResBlocks(4, False, 64, 128)
b4 = ResBlocks(6, False, 128, 256)
b5 = ResBlocks(3, False, 256, 512)
net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), nn.Flatten(), nn.Linear(512, 10))
# probe per-stage output shapes with a dummy input
x = torch.zeros((1, 1, 224, 224))
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__, "\t输出的形状为:", x.shape)

def init_weights(layer):
    """Xavier-initialize conv/linear weights; the default init trains poorly here."""
    if type(layer) == nn.Linear or type(layer) == nn.Conv2d:
        nn.init.xavier_uniform_(layer.weight)

net.apply(init_weights)
# fixed label: this net is ResNet-34 (3+4+6+3 basic blocks), not ResNet-18
print("Resnet34的结构为:", net)
net = net.to(device)  # move the model once, instead of on every mini-batch
optimizer = optim.SGD(net.parameters(), lr=0.12)
loss = nn.CrossEntropyLoss(reduction='mean')
epoch = 10
losses = []
for i in range(epoch):
    net.train()  # BatchNorm uses batch statistics during training
    loss_sum = 0.0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        loss_temp = loss(y_hat, y)
        # .item() detaches the scalar; accumulating the tensor retained
        # every batch's autograd graph
        loss_sum += loss_temp.item()
        optimizer.zero_grad()
        loss_temp.backward()
        optimizer.step()
    # average the per-batch mean losses over batches, not over samples
    losses.append(loss_sum / len(train_loader))
    print("epoch: ", i, "loss=", loss_sum)
acc = 0
net.eval()  # BatchNorm must use running statistics at evaluation time
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device)
        y_hat = net(x)
        acc += torch.sum(y_hat.argmax(dim=1).type(y.dtype) == y)
print("测试集准确度", (acc / len(test_set)).item())
plt.plot(range(epoch), losses)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.show()
epoch: 0 loss= 379.17254638671875
epoch: 1 loss= 152.3948211669922
epoch: 2 loss= 121.77942657470703
epoch: 3 loss= 103.70003509521484
epoch: 4 loss= 89.28633117675781
epoch: 5 loss= 75.78622436523438
epoch: 6 loss= 62.75402069091797
epoch: 7 loss= 53.15045928955078
epoch: 8 loss= 42.1845703125
epoch: 9 loss= 34.198307037353516
测试集准确度 0.9125999808311462
对于更深层次的网络,ResNet引入了“bottleneck”架构来降低模型复杂性。请你试着去实现它。
# dummy 224x224 single-channel input, consumed by the shape-probe loop after the net is built
x = torch.zeros((1, 1, 224, 224))
class bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 bottleneck residual unit (ResNet-50 style).

    c_num = [in_channels, mid_channels, out_channels]; the 3x3 conv carries the
    stride. With conv_skip, a strided 1x1 conv projects the shortcut.
    NOTE(review): ReLU is applied before the addition and the sum is returned
    without a final ReLU, which differs from the original paper — verify intended.
    """
    def __init__(self, c_num, conv_skip=True, stride=1):
        super().__init__()
        cin, cmid, cout = c_num
        stages = [
            nn.Conv2d(cin, cmid, kernel_size=1, padding=0, stride=1),
            nn.BatchNorm2d(cmid),
            nn.ReLU(),
            nn.Conv2d(cmid, cmid, kernel_size=3, padding=1, stride=stride),
            nn.BatchNorm2d(cmid),
            nn.ReLU(),
            nn.Conv2d(cmid, cout, kernel_size=1, padding=0, stride=1),
            nn.BatchNorm2d(cout),
            nn.ReLU(),
        ]
        self.conv_layer = nn.Sequential(*stages)
        if conv_skip:
            self.conv_skip = nn.Conv2d(cin, cout, kernel_size=1, padding=0, stride=stride)
        else:
            self.conv_skip = None

    def forward(self, x):
        y = self.conv_layer(x)
        shortcut = self.conv_skip(x) if self.conv_skip is not None else x
        return y + shortcut
def bottle_block(block_num, c_num, b2=False):
    """Stack `block_num` bottleneck units; the first one downsamples (stride 2)
    and projects the shortcut, the rest keep c_num[2] channels at stride 1.

    NOTE(review): with b2=True the first unit would receive c_num[2] input
    channels instead of c_num[0] — the callers below always use b2=False.
    """
    units = [
        bottleneck(c_num, True, stride=2) if (i == 0 and not b2)
        else bottleneck([c_num[2], c_num[1], c_num[2]], False)
        for i in range(block_num)
    ]
    return nn.Sequential(*units)
# Stem identical to the ResNet-34 version: 7x7/stride-2 conv then 3x3/stride-2 max-pool
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64),
                   nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# ResNet-50-style stages: 3/4/6/3 bottlenecks with 4x channel expansion per stage
b2 = bottle_block(3, [64, 64, 256])
b3 = bottle_block(4, [256, 128, 512])
b4 = bottle_block(6, [512, 256, 1024])
b5 = bottle_block(3, [1024, 512, 2048])
net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1,1)), nn.Flatten(), nn.Linear(2048, 10))
# probe per-stage output shapes using the dummy input defined above
for layer in net:
    x = layer(x)
    print(layer.__class__.__name__, "\t输出的形状为:", x.shape)
Sequential 输出的形状为: torch.Size([1, 64, 56, 56])
Sequential 输出的形状为: torch.Size([1, 256, 28, 28])
Sequential 输出的形状为: torch.Size([1, 512, 14, 14])
Sequential 输出的形状为: torch.Size([1, 1024, 7, 7])
Sequential 输出的形状为: torch.Size([1, 2048, 4, 4])
AdaptiveAvgPool2d 输出的形状为: torch.Size([1, 2048, 1, 1])
Flatten 输出的形状为: torch.Size([1, 2048])
Linear 输出的形状为: torch.Size([1, 10])
为什么即使函数类是嵌套的,我们仍然要限制增加函数的复杂性呢?
- 如无必要,勿增实体,拟合能力过强的函数会导致过拟合。
- 复杂的函数会消耗大量的计算资源
- 可解释性会变差
Densenet
先训练一下Densenet
epoch: 0 loss= 574.0405883789062
epoch: 1 loss= 304.017578125
epoch: 2 loss= 242.1274871826172
epoch: 3 loss= 208.28538513183594
epoch: 4 loss= 185.4270782470703
epoch: 5 loss= 170.85366821289062
epoch: 6 loss= 156.8195343017578
epoch: 7 loss= 142.7688446044922
epoch: 8 loss= 132.4578857421875
epoch: 9 loss= 120.2202377319336
epoch: 10 loss= 110.72298431396484
epoch: 11 loss= 103.44509887695312
epoch: 12 loss= 94.1830825805664
epoch: 13 loss= 88.94744110107422
epoch: 14 loss= 80.26952362060547
测试集准确度 0.877299964427948
为什么我们在过渡层使用平均汇聚层而不是最大汇聚层?
将平均汇聚改为最大汇聚:
def conv_dense(in_channels, out_channels):
    """DenseNet layer: BN-ReLU-1x1 bottleneck (fixed at 128 channels), then BN-ReLU-3x3 conv."""
    bottleneck_width = 128
    layers = [
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, bottleneck_width, kernel_size=1, padding=0, stride=1),
        nn.BatchNorm2d(bottleneck_width),
        nn.ReLU(),
        nn.Conv2d(bottleneck_width, out_channels, kernel_size=3, stride=1, padding=1),
    ]
    return nn.Sequential(*layers)
class DesBlock(nn.Module):
    """DenseNet dense block.

    Holds `nums` conv_dense layers; layer i receives in_channels + i*out_channels
    input channels because every earlier output is concatenated onto the input.
    (Fixed: removed the redundant `self.blocks = self.blocks` no-op and the
    duplicate plain-list attribute — nn.Sequential already registers the layers.)
    """
    def __init__(self, nums, in_channels, out_channels):
        super().__init__()
        self.net = nn.Sequential(*(
            conv_dense(in_channels + i * out_channels, out_channels)
            for i in range(nums)
        ))

    def forward(self, x):
        # grow the feature map by concatenating each layer's output
        for block in self.net:
            y = block(x)
            x = torch.cat((x, y), dim=1)
        return x
def transition(in_channels):
    """Transition layer: halve the channel count with a 1x1 conv, halve the spatial size.

    This is the exercise variant that swaps the usual average pooling for max pooling.
    """
    halved = in_channels // 2
    pieces = (
        nn.BatchNorm2d(in_channels), nn.ReLU(),
        nn.Conv2d(in_channels, halved, kernel_size=1, padding=0, stride=1),
        nn.MaxPool2d(kernel_size=2, stride=2),
    )
    return nn.Sequential(*pieces)
epoch: 0 loss= 515.3784790039062
epoch: 1 loss= 282.50244140625
epoch: 2 loss= 232.67477416992188
epoch: 3 loss= 202.9937286376953
epoch: 4 loss= 178.63426208496094
epoch: 5 loss= 161.4530029296875
epoch: 6 loss= 147.14990234375
epoch: 7 loss= 129.9204864501953
epoch: 8 loss= 117.783203125
epoch: 9 loss= 106.5365219116211
epoch: 10 loss= 94.76472473144531
epoch: 11 loss= 87.81078338623047
epoch: 12 loss= 78.73601531982422
epoch: 13 loss= 69.39637756347656
epoch: 14 loss= 62.99326705932617
测试集准确度 0.9085999727249146
准确度更高。
DenseNet的优点之一是其模型参数比ResNet小。为什么呢?
重复使用了特征图,使得无需重复学习的参数。
DenseNet一个诟病的问题是内存或显存消耗过多。真的是这样吗?可以把输入形状换成 224×224 ,来看看实际的显存消耗。
是这样的,可能因为要保存许多的特征图,6G显存不太够,需要降低batchsize。