Residual Networks (ResNet)
Does adding more layers always improve accuracy?
Not always. The core idea of ResNet is to design layers so that adding more of them at least never makes the model worse.
Residual Blocks
Stacking another layer changes the function class, and what we want is for each added layer to enlarge that class. The residual block therefore adds a shortcut connection (the right-hand path) to obtain the structure f(x) = x + g(x): the input x is carried over a shortcut straight to the output, so even if the newly added layers g learn nothing, the block still passes the original x through.
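To see why this guarantees "no worse", here is a minimal sketch (my own illustration, not the d2l implementation below): if the learned part g contributes nothing, the block reduces exactly to the identity.

import torch
from torch import nn

# Hypothetical toy wrapper: wraps an arbitrary sub-network g so that the
# block computes f(x) = x + g(x).
class ToyResidual(nn.Module):
    def __init__(self, g):
        super().__init__()
        self.g = g
    def forward(self, x):
        return x + self.g(x)

# A conv whose weights are zeroed out stands in for "g learned nothing".
g = nn.Conv2d(3, 3, kernel_size=3, padding=1, bias=False)
nn.init.zeros_(g.weight)
blk = ToyResidual(g)
x = torch.rand(2, 3, 4, 4)
print(torch.allclose(blk(x), x))  # True: the block passes x through unchanged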
ResNet Block Details
In the figure below, the left-hand design adds x directly to the output. But this no longer works once the number of channels changes, so we place a 1 × 1 convolution on the shortcut to match the channels.
Figure: different variants of the residual block.
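The shape mismatch is easy to see with concrete numbers (illustrative shapes chosen here, not from the original): when the main path changes the channel count and halves the spatial size, a plain identity shortcut cannot be added to its output, while a stride-2 1 × 1 convolution produces a matching shape.

import torch
from torch import nn

# Main path: 3 -> 6 channels, halves H and W.
main = nn.Conv2d(3, 6, kernel_size=3, padding=1, stride=2)
# Shortcut: a 1x1 conv with the same stride matches channels and size.
shortcut = nn.Conv2d(3, 6, kernel_size=1, stride=2)

x = torch.rand(1, 3, 6, 6)
print(main(x).shape)      # torch.Size([1, 6, 3, 3])
print(shortcut(x).shape)  # torch.Size([1, 6, 3, 3]) -- the two can now be summed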
ResNet Blocks
A ResNet block that halves the height and width (stride 2), followed by several ResNet blocks that keep the height and width unchanged; a sketch of this stage pattern follows.
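A quick sketch of the stage pattern (a simplification using bare convolutions, not the Residual class defined later): the first layer downsamples, the rest preserve the shape.

import torch
from torch import nn

# Toy stage: a stride-2 layer that halves H and W (and doubles channels here),
# followed by a layer that keeps the shape unchanged.
def toy_stage(in_ch, out_ch):
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, stride=2),
        nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1))

x = torch.rand(1, 64, 56, 56)
print(toy_stage(64, 128)(x).shape)  # torch.Size([1, 128, 28, 28])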
ResNet Architecture
Summary
Residual blocks make it easy for an added layer to learn the identity mapping, so a deeper network is at least as expressive as a shallower one; this is what makes training very deep networks practical.
Code Implementation
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

class Residual(nn.Module):
    """A residual block computing f(x) = x + g(x), with an optional 1x1-conv shortcut."""
    def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
        super().__init__()
        # Main path g(x): two 3x3 convolutions, each followed by batch norm.
        self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3,
                               padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        # Shortcut path: a 1x1 conv adapts channels and stride when they change.
        if use_1x1conv:
            self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1,
                                   stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        Y += X  # the residual connection
        return F.relu(Y)
# With matching channels and stride 1, input and output shapes agree.
blk = Residual(3, 3)
X = torch.rand(4, 3, 6, 6)
Y = blk(X)
Y.shape
torch.Size([4, 3, 6, 6])
# Halve the height and width while increasing the number of channels.
blk = Residual(3, 6, use_1x1conv=True, strides=2)
X = torch.rand(4, 3, 6, 6)
blk(X).shape
torch.Size([4, 6, 3, 3])
# Stage 1: a 7x7 convolution with batch norm, followed by 3x3 max pooling.
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
    """A stage of residual blocks; except in the first stage, the leading block
    halves the height and width with stride 2."""
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.append(Residual(input_channels, num_channels, use_1x1conv=True,
                                strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk
b2 = nn.Sequential(*resnet_block(64, 64, 2, first_block=True))
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))

net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1, 1)),
                    nn.Flatten(), nn.Linear(512, 10))
# Inspect the output shape of every top-level module on a dummy input.
X = torch.rand(size=(1, 1, 224, 224))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)
Sequential output shape: torch.Size([1, 64, 56, 56])
Sequential output shape: torch.Size([1, 64, 56, 56])
Sequential output shape: torch.Size([1, 128, 28, 28])
Sequential output shape: torch.Size([1, 256, 14, 14])
Sequential output shape: torch.Size([1, 512, 7, 7])
AdaptiveAvgPool2d output shape: torch.Size([1, 512, 1, 1])
Flatten output shape: torch.Size([1, 512])
Linear output shape: torch.Size([1, 10])
lr, num_epochs, batch_size = 0.05, 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
Training Results
loss 0.017, train acc 0.995, test acc 0.918
371.7 examples/sec on cuda:0