目 录
全文较长,认真读完约需20分钟,分析网络结构还需约1小时。
一、ShuffleNet_v1
1. ShuffleNet 基本单元
图(a)是传统的ResNet残差(bottleneck)结构:首先是1x1卷积,接着是3x3卷积,最后再用1x1卷积调整通道数,并与输入进行短路连接(相加)。
图(b)将密集的1x1卷积替换成1x1的group convolution,不过在第一个1x1卷积之后增加了一个channel shuffle操作。值得注意的是3x3卷积后面没有增加channel shuffle,按paper的意思,对于这样一个残差单元,一个channel shuffle操作是足够了。还有就是3x3的depthwise convolution之后没有使用ReLU激活函数。
图(c)展示了stride=2时的降采样单元:对原输入(短路分支)采用stride=2的3x3 avg pool,主分支的depthwise convolution也取stride=2,以保证两个通路的shape相同;然后将两路特征图在通道维上进行连接(concat,可能借鉴了DenseNet),而不是相加,从而极致地降低计算量与参数量。
2. ShuffleNet 网络结构
从上图可以看到,最开始使用3x3的卷积和max Pool层 ,然后是3个Stage ,对于每个阶段,第一个基本单元采用的是stride=2,这样特征图width和height各降低一半,而通道数增加一倍。后面的基本单元都是stride=1,特征图和通道数都保持不变。对于基本单元来说,其中瓶颈层,就是3x3卷积层的通道数为输出通道数的1/4,这和残差单元的设计理念是一样的。
3.ShuffleNet 代码
'''ShuffleNet in PyTorch.
See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
def conv_bn(inp, oup, stride):
    """Build a 3x3x3 ``Conv3d`` -> ``BatchNorm3d`` -> ``ReLU`` block.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride forwarded unchanged to ``nn.Conv3d`` (int or 3-tuple).

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution pads by 1 on every axis and has no bias term, since the
        batch norm that follows supplies the shift.
    """
    return nn.Sequential(
        nn.Conv3d(inp, oup, kernel_size=3, stride=stride, padding=(1, 1, 1), bias=False),
        nn.BatchNorm3d(oup),
        nn.ReLU(inplace=True),
    )
def channel_shuffle(x, groups):
    """Interleave the channels of a 5-D tensor across ``groups``.

    Shape flow: [N, C, D, H, W] -> [N, g, C/g, D, H, W]
    -> [N, C/g, g, D, H, W] -> [N, C, D, H, W].

    Args:
        x: Tensor with exactly five dimensions; dimension 1 is shuffled.
        groups: Number of groups ``g``. The size of dimension 1 must be
            divisible by it, otherwise the reshape below fails.

    Returns:
        A tensor of the same shape as ``x`` with dimension 1 reordered so that
        output channel ``i * g + j`` is input channel ``j * (C / g) + i``.
    """
    batchsize, num_channels, depth, height, width = x.size()
    channels_per_group = num_channels // groups  # floor division

    # Split the channel axis into (groups, channels_per_group).
    x = x.view(batchsize, groups, channels_per_group, depth, height, width)
    # Swap the two channel sub-axes; contiguous() is required before view().
    x = x.permute(0, 2, 1, 3, 4, 5).contiguous()
    # Merge the two sub-axes back into a single channel axis.
    return x.view(batchsize, num_channels, depth, height, width)
class Bottleneck(nn.Module):
    """ShuffleNet v1 unit built from 3-D convolutions.

    Main path: 1x1x1 group conv -> BN -> ReLU -> channel shuffle ->
    3x3x3 depthwise conv -> BN -> 1x1x1 group conv -> BN.

    * ``stride == 1``: the main path is added to the input, then ReLU.
    * ``stride == 2``: the input is average-pooled and concatenated with the
      main path along the channel axis, then ReLU.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of channels the unit returns.
        stride: 1 or 2; selects the residual-add or the concat variant.
        groups: Group count for the 1x1x1 convolutions and the shuffle.
    """

    def __init__(self, in_planes, out_planes, stride, groups):
        super(Bottleneck, self).__init__()
        self.stride = stride
        self.groups = groups

        # The bottleneck width is a quarter of the requested output width.
        mid_planes = out_planes // 4
        if self.stride == 2:
            # The shortcut contributes in_planes channels through the concat,
            # so the main path only has to produce the remainder.
            out_planes = out_planes - in_planes

        # A 24-channel input gets an ungrouped first 1x1x1 convolution
        # (24 is the stem width in the tables of the networks using this unit).
        g = 1 if in_planes == 24 else groups

        self.conv1 = nn.Conv3d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
        self.bn1 = nn.BatchNorm3d(mid_planes)
        # groups == channels makes this a depthwise convolution.
        self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride,
                               padding=1, groups=mid_planes, bias=False)
        self.bn2 = nn.BatchNorm3d(mid_planes)
        self.conv3 = nn.Conv3d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
        self.bn3 = nn.BatchNorm3d(out_planes)
        self.relu = nn.ReLU(inplace=True)

        if stride == 2:
            # NOTE(review): the pooling window is 2 (no padding) on the first
            # spatial axis while conv2 uses 3 with padding 1, so the two paths
            # only agree in shape when that axis has even length.
            self.shortcut = nn.AvgPool3d(kernel_size=(2, 3, 3), stride=2, padding=(0, 1, 1))

    def forward(self, x):
        """Apply the unit to ``x`` and return the activated result."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = channel_shuffle(out, self.groups)
        # No ReLU after the depthwise convolution.
        out = self.bn2(self.conv2(out))
        out = self.bn3(self.conv3(out))

        if self.stride == 2:
            return self.relu(torch.cat([out, self.shortcut(x)], 1))
        return self.relu(out + x)
class ShuffleNet(nn.Module):
    """3-D ShuffleNet v1 classifier.

    Layout: 3x3x3 conv stem -> max pool -> three stages of ``Bottleneck``
    units (4, 8 and 4 units; the first unit of each stage has stride 2)
    -> global average pool -> dropout -> linear classifier.

    Args:
        groups: Group count of the 1x1x1 convolutions; one of 1, 2, 3, 4, 8.
        width_mult: Multiplier applied to every entry of the channel table.
        num_classes: Size of the classifier output.

    Raises:
        ValueError: If ``groups`` has no entry in the channel table.
    """

    # groups -> (stem width, stage-2 width, stage-3 width, stage-4 width).
    # The groups=1 row ends in 576 as in the ShuffleNet paper; this listing
    # previously had 567, so checkpoints trained with that value will not load.
    _STAGE_OUT_PLANES = {
        1: (24, 144, 288, 576),
        2: (24, 200, 400, 800),
        3: (24, 240, 480, 960),
        4: (24, 272, 544, 1088),
        8: (24, 384, 768, 1536),
    }

    def __init__(self,
                 groups=3,
                 width_mult=1,
                 num_classes=3):
        super(ShuffleNet, self).__init__()
        self.num_classes = num_classes
        self.groups = groups
        num_blocks = [4, 8, 4]

        base_planes = self._STAGE_OUT_PLANES.get(groups)
        if base_planes is None:
            raise ValueError(
                "{} groups is not supported for\n1x1 Grouped Convolutions".format(self.groups))
        out_planes = [int(i * width_mult) for i in base_planes]

        # Index 0 is the stem width; indices 1..3 are the three stages.
        self.in_planes = out_planes[0]
        self.conv1 = conv_bn(3, self.in_planes, stride=(1, 2, 2))
        self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(out_planes[1], num_blocks[0], self.groups)
        self.layer2 = self._make_layer(out_planes[2], num_blocks[1], self.groups)
        self.layer3 = self._make_layer(out_planes[3], num_blocks[2], self.groups)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(out_planes[3], self.num_classes)
        )

    def _make_layer(self, out_planes, num_blocks, groups):
        """Stack ``num_blocks`` units; only the first one uses stride 2.

        Reads and updates ``self.in_planes`` so that consecutive stages chain.
        """
        layers = []
        for i in range(num_blocks):
            stride = 2 if i == 0 else 1
            layers.append(Bottleneck(self.in_planes, out_planes, stride=stride, groups=groups))
            self.in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class scores for ``x``, a 5-D tensor with 3 channels."""
        out = self.conv1(x)
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        # Global average pool: the kernel covers the whole remaining volume.
        out = F.avg_pool3d(out, out.size()[-3:])
        out = out.view(out.size(0), -1)
        return self.classifier(out)
def get_fine_tuning_parameters(model, ft_portion):
    """Select the parameters to hand to an optimizer for fine-tuning.

    Args:
        model: Module exposing ``parameters()`` and ``named_parameters()``.
        ft_portion: ``"complete"`` to train everything, or ``"last_layer"``
            to train only parameters whose name contains ``"classifier"``.

    Returns:
        For ``"complete"``, ``model.parameters()`` unchanged. For
        ``"last_layer"``, a list with one ``{'params': ...}`` dict per
        parameter in ``named_parameters()`` order; every parameter outside
        the classifier also carries ``'lr': 0.0``.

    Raises:
        ValueError: If ``ft_portion`` is neither of the two supported values.
    """
    if ft_portion == "complete":
        return model.parameters()
    if ft_portion != "last_layer":
        raise ValueError("Unsupported ft_portion: 'complete' or 'last_layer' expected")

    ft_module_names = ['classifier']
    parameters = []
    for k, v in model.named_parameters():
        # Substring match, so wrapper prefixes in the name do not matter.
        if any(ft_module in k for ft_module in ft_module_names):
            parameters.append({'params': v})
        else:
            parameters.append({'params': v, 'lr': 0.0})
    return parameters
def get_model(**kwargs):
    """Build a ``ShuffleNet``.

    Args:
        **kwargs: Forwarded unchanged to the ``ShuffleNet`` constructor.

    Returns:
        The newly constructed model.
    """
    return ShuffleNet(**kwargs)
if __name__ == "__main__":
    # Smoke test: build the network and push one random batch through it.
    model = get_model(groups=3, num_classes=3, width_mult=1)
    # Fall back to the CPU so the script also runs on machines without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if device.type == "cuda":
        model = nn.DataParallel(model, device_ids=None)
    print(model)

    # Batch of 8 clips: 3 channels, 16 frames, 112x112 pixels.
    input_var = torch.randn(8, 3, 16, 112, 112, device=device)
    output = model(input_var)
    print(output.shape)
二、ShuffleNet_v2
1.对比v1与v2版本的基础单元
(a)(b)是v1版本的两种不同的block结构,两者的差别在于后者对特征图尺寸做了缩小
(c)(d)是v2版本的两种不同的block结构:stride = 1时增加channel split,stride = 2时另增加一个卷积分支
- 从(a)和(c)的对比可以看出,(c)首先在开始处增加了一个channel split操作,这个操作将输入特征的通道分成c-c’和c’两部分,c’在文章中采用c/2;这主要是和第1点发现对应(G1:卷积的输入、输出通道数相等时,内存访问成本MAC最小)
- 然后(c)中取消了1*1卷积层中的group操作,这和第2点发现对应(G2:过多的分组卷积会增加MAC),同时前面的channel split其实已经算是变相的group操作了
- channel shuffle的操作移到了concat后面,和第3点发现对应(G3:网络碎片化会降低并行度),同时也是因为第一个1*1卷积层没有group操作,所以在其后面跟channel shuffle也没有太大必要
- 最后是将element-wise add操作替换成concat,这个和第4点发现对应(G4:逐元素操作的开销不可忽略)。
多个(c)结构连接在一起的话,channel split、concat和channel shuffle是可以合并在一起的。(b)和(d)的对比也是同理,只不过因为(d)的开始处没有channel split操作,所以最后concat后特征图通道数翻倍,可以结合后面具体网络结构来看:
2.ShuffleNet_v2网络结构
3.ShuffleNet_v2 代码
'''ShuffleNetV2 in PyTorch.
See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from collections import OrderedDict
from torch.nn import init
import math
def conv_bn(inp, oup, stride):
    """Build a 3x3x3 ``Conv3d`` -> ``BatchNorm3d`` -> ``ReLU`` block.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride forwarded unchanged to ``nn.Conv3d`` (int or 3-tuple).

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution pads by 1 on every axis and has no bias term.
    """
    return nn.Sequential(
        nn.Conv3d(inp, oup, kernel_size=3, stride=stride, padding=(1, 1, 1), bias=False),
        nn.BatchNorm3d(oup),
        nn.ReLU(inplace=True),
    )
def conv_1x1x1_bn(inp, oup):
    """Build a pointwise ``Conv3d`` -> ``BatchNorm3d`` -> ``ReLU`` block.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.

    Returns:
        An ``nn.Sequential`` whose convolution has kernel size 1, stride 1,
        no padding and no bias, so only the channel count changes.
    """
    return nn.Sequential(
        nn.Conv3d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm3d(oup),
        nn.ReLU(inplace=True),
    )
def channel_shuffle(x, groups):
    """Interleave the channels of a 5-D tensor across ``groups``.

    Shape flow: [N, C, D, H, W] -> [N, g, C/g, D, H, W]
    -> [N, C/g, g, D, H, W] -> [N, C, D, H, W].

    Args:
        x: Tensor with exactly five dimensions; dimension 1 is shuffled.
        groups: Number of groups ``g``. The size of dimension 1 must be
            divisible by it, otherwise the reshape below fails.

    Returns:
        A tensor of the same shape as ``x`` with dimension 1 reordered so that
        output channel ``i * g + j`` is input channel ``j * (C / g) + i``.
    """
    batchsize, num_channels, depth, height, width = x.size()
    channels_per_group = num_channels // groups

    # Split the channel axis into (groups, channels_per_group).
    x = x.view(batchsize, groups, channels_per_group, depth, height, width)
    # Swap the two channel sub-axes; contiguous() is required before view().
    x = x.permute(0, 2, 1, 3, 4, 5).contiguous()
    # Merge the two sub-axes back into a single channel axis.
    return x.view(batchsize, num_channels, depth, height, width)
class InvertedResidual(nn.Module):
    """ShuffleNet v2 unit built from 3-D convolutions.

    * ``stride == 1``: the input channels are split in half; the first half
      passes through untouched and the second half goes through ``banch2``.
    * ``stride == 2``: the whole input goes through both ``banch1`` and
      ``banch2``.

    In both cases the two results are concatenated along the channel axis and
    shuffled with two groups.

    Args:
        inp: Number of input channels. Only used to size layers when
            ``stride == 2``; for ``stride == 1`` the layers are sized from
            ``oup // 2``.
        oup: Channel count used to size the branches (each yields ``oup // 2``).
        stride: 1 or 2.

    The attribute names ``banch1`` / ``banch2`` are kept as they are because
    they are part of the state-dict keys.
    """

    def __init__(self, inp, oup, stride):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        oup_inc = oup // 2

        if self.stride == 2:
            # Downsampling shortcut: depthwise 3x3x3, then pointwise.
            self.banch1 = nn.Sequential(
                # dw
                nn.Conv3d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm3d(inp),
                # pw-linear
                nn.Conv3d(inp, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm3d(oup_inc),
                nn.ReLU(inplace=True),
            )

        # With stride 1 the main branch only sees the second half of the
        # split input; with stride 2 it sees the full input.
        branch_in = oup_inc if self.stride == 1 else inp
        self.banch2 = nn.Sequential(
            # pw
            nn.Conv3d(branch_in, oup_inc, 1, 1, 0, bias=False),
            nn.BatchNorm3d(oup_inc),
            nn.ReLU(inplace=True),
            # dw
            nn.Conv3d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
            nn.BatchNorm3d(oup_inc),
            # pw-linear
            nn.Conv3d(oup_inc, oup_inc, 1, 1, 0, bias=False),
            nn.BatchNorm3d(oup_inc),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def _concat(x, out):
        """Concatenate ``x`` and ``out`` along the channel axis (dim 1)."""
        return torch.cat((x, out), 1)

    def forward(self, x):
        """Apply the unit to ``x`` and return the shuffled concatenation."""
        if self.stride == 1:
            half = x.shape[1] // 2
            x1 = x[:, :half, :, :, :]
            x2 = x[:, half:, :, :, :]
            out = self._concat(x1, self.banch2(x2))
        elif self.stride == 2:
            out = self._concat(self.banch1(x), self.banch2(x))
        return channel_shuffle(out, 2)
class ShuffleNetV2(nn.Module):
    """3-D ShuffleNet v2 classifier.

    Layout: 3x3x3 conv stem -> max pool -> three stages of
    ``InvertedResidual`` units (4, 8 and 4 units; the first unit of each
    stage has stride 2) -> 1x1x1 conv -> global average pool -> dropout
    -> linear classifier.

    Args:
        num_classes: Size of the classifier output.
        sample_size: Must be a multiple of 16; it is only validated here and
            not otherwise used by the constructor.
        width_mult: Selects a row of the channel table; one of
            0.25, 0.5, 1.0, 1.5, 2.0.

    Raises:
        ValueError: If ``width_mult`` has no entry in the channel table.
    """

    # width_mult -> channel widths. Index 0 is a placeholder that is never
    # read; index 1 is the stem, 2..4 the three stages, -1 the final conv.
    # The 2.0 row starts its stages at 244 as in the ShuffleNet V2 paper; this
    # listing previously had 224, so checkpoints trained with it will not load.
    _STAGE_OUT_CHANNELS = {
        0.25: (-1, 24, 32, 64, 128, 1024),
        0.5: (-1, 24, 48, 96, 192, 1024),
        1.0: (-1, 24, 116, 232, 464, 1024),
        1.5: (-1, 24, 176, 352, 704, 1024),
        2.0: (-1, 24, 244, 488, 976, 2048),
    }

    def __init__(self, num_classes=3, sample_size=112, width_mult=1.):
        super(ShuffleNetV2, self).__init__()
        assert sample_size % 16 == 0

        self.stage_repeats = [4, 8, 4]

        channels = self._STAGE_OUT_CHANNELS.get(width_mult)
        if channels is None:
            # This path used to format an undefined name and so raised
            # NameError; it also blamed "groups" although the unsupported
            # value is the width multiplier.
            raise ValueError(
                "width_mult {} is not supported; expected one of {}".format(
                    width_mult, sorted(self._STAGE_OUT_CHANNELS)))
        self.stage_out_channels = list(channels)

        # building first layer
        input_channel = self.stage_out_channels[1]
        self.conv1 = conv_bn(3, input_channel, stride=(1, 2, 2))
        self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1)

        # building inverted residual blocks
        blocks = []
        for idxstage, numrepeat in enumerate(self.stage_repeats):
            output_channel = self.stage_out_channels[idxstage + 2]
            for i in range(numrepeat):
                # Only the first unit of a stage downsamples.
                stride = 2 if i == 0 else 1
                blocks.append(InvertedResidual(input_channel, output_channel, stride))
                input_channel = output_channel
        self.features = nn.Sequential(*blocks)

        # building last several layers
        self.conv_last = conv_1x1x1_bn(input_channel, self.stage_out_channels[-1])

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.stage_out_channels[-1], num_classes)
        )

    def forward(self, x):
        """Return class scores for ``x``, a 5-D tensor with 3 channels."""
        out = self.conv1(x)
        out = self.maxpool(out)
        out = self.features(out)
        out = self.conv_last(out)
        # Global average pool: the kernel covers the whole remaining volume.
        out = F.avg_pool3d(out, out.size()[-3:])
        out = out.view(out.size(0), -1)
        return self.classifier(out)
def get_fine_tuning_parameters(model, ft_portion):
    """Select the parameters to hand to an optimizer for fine-tuning.

    Args:
        model: Module exposing ``parameters()`` and ``named_parameters()``.
        ft_portion: ``"complete"`` to train everything, or ``"last_layer"``
            to train only parameters whose name contains ``"classifier"``.

    Returns:
        For ``"complete"``, ``model.parameters()`` unchanged. For
        ``"last_layer"``, a list with one ``{'params': ...}`` dict per
        parameter in ``named_parameters()`` order; every parameter outside
        the classifier also carries ``'lr': 0.0``.

    Raises:
        ValueError: If ``ft_portion`` is neither of the two supported values.
    """
    if ft_portion == "complete":
        return model.parameters()
    if ft_portion != "last_layer":
        raise ValueError("Unsupported ft_portion: 'complete' or 'last_layer' expected")

    ft_module_names = ['classifier']
    parameters = []
    for k, v in model.named_parameters():
        # Substring match, so wrapper prefixes in the name do not matter.
        if any(ft_module in k for ft_module in ft_module_names):
            parameters.append({'params': v})
        else:
            parameters.append({'params': v, 'lr': 0.0})
    return parameters
def get_model(**kwargs):
    """Build a ``ShuffleNetV2``.

    Args:
        **kwargs: Forwarded unchanged to the ``ShuffleNetV2`` constructor.

    Returns:
        The newly constructed model.
    """
    return ShuffleNetV2(**kwargs)
if __name__ == "__main__":
    # Smoke test: build the network and push one random batch through it.
    model = get_model(num_classes=3, sample_size=112, width_mult=1.)
    # Fall back to the CPU so the script also runs on machines without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if device.type == "cuda":
        model = nn.DataParallel(model, device_ids=None)
    print(model)

    # Batch of 8 clips: 3 channels, 16 frames, 112x112 pixels.
    input_var = torch.randn(8, 3, 16, 112, 112, device=device)
    output = model(input_var)
    print(output.shape)