Introduction
As convolutional neural networks have developed, their accuracy has kept improving, but so have their size and parameter counts.
Large parameter counts inevitably consume a lot of compute. For deployment, models usually have to run on phones or edge devices with far less computing power than a PC, so we need efficient networks that stay fast while keeping accuracy within an acceptable range, e.g. for on-phone applications or autonomous driving.
Benefits of lightweight models: there is no need to ship data over the network to a cloud server for processing; inference runs on the device in real time, which satisfies latency-critical applications such as autonomous driving. Edge computing also keeps data private, removing the risk that cloud-bound inference traffic is intercepted.
Against this background, Google proposed the lightweight, efficient MobileNet in 2017.
Analysis
For a model to run in real time, inference must be fast. In a traditional CNN, the convolution layers account for most of the inference time, as profiled in:
https://www2.eecs.berkeley.edu/Pubs/TechRpts/2014/EECS-2014-93.pdf
The larger the batch, the more time the convolution layers consume, so lightweight networks generally focus on optimizing the convolutions.
NASNet takes a different route: it uses reinforcement learning to search for a lightweight architecture tailored to a specific dataset.
Efficiency per model parameter
Depthwise separable convolution decouples spatial information (within each channel) from cross-channel information, extracting features in two steps.
It can be viewed as a special case of grouped convolution.
In a grouped convolution, each kernel processes one group of channels; in the depthwise case the group size is 1, i.e. every kernel processes exactly one channel.
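As a minimal illustration in PyTorch (the channel count C here is arbitrary, not from the original post):

```python
import torch.nn as nn

C = 32
# groups=C gives each kernel exactly one input channel: depthwise convolution
depthwise = nn.Conv2d(C, C, kernel_size=3, padding=1, groups=C)
# For comparison, groups=4 is a grouped conv where each kernel sees C/4 channels
grouped = nn.Conv2d(C, C, kernel_size=3, padding=1, groups=4)
```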
Computational cost
Standard convolution
The number of multiplications for a single kernel position is kernel size × number of input channels M, and each position produces one pixel of the output feature map, so the total multiply count is this per-position cost times the number of feature-map elements (and times the number of kernels).
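In the MobileNetV1 paper's notation ($D_K$: kernel width/height, $M$: input channels, $N$: output channels/kernels, $D_F$: output feature-map width/height), the total cost of a standard convolution is:

$$D_K \cdot D_K \cdot M \cdot N \cdot D_F \cdot D_F$$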
Depthwise separable convolution
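The depthwise step costs $D_K \cdot D_K \cdot M \cdot D_F \cdot D_F$ multiplications and the pointwise step $M \cdot N \cdot D_F \cdot D_F$, so the cost relative to a standard convolution is:

$$\frac{D_K \cdot D_K \cdot M \cdot D_F \cdot D_F + M \cdot N \cdot D_F \cdot D_F}{D_K \cdot D_K \cdot M \cdot N \cdot D_F \cdot D_F} = \frac{1}{N} + \frac{1}{D_K^2}$$

For the usual 3×3 kernels this is roughly an 8–9× reduction in multiplications.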
Efficient convolution computation
The elements covered by each kernel position are flattened into a vector, and these vectors are concatenated into a matrix (the im2col trick).
The kernels are flattened the same way, so the whole convolution reduces to a single matrix multiplication.
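A minimal sketch of this idea with PyTorch's F.unfold (the shapes and tensor names are illustrative, not from the original post):

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 8)        # N, C, H, W
weight = torch.randn(16, 3, 3, 3)  # out_channels, in_channels, kH, kW

# im2col: every 3x3 receptive field becomes one column
cols = F.unfold(x, kernel_size=3, padding=1)  # (1, 3*3*3, 8*8)
# One matrix multiply computes the whole convolution
out = weight.view(16, -1) @ cols              # (1, 16, 64)
out = out.view(1, 16, 8, 8)

# Matches the built-in convolution
assert torch.allclose(out, F.conv2d(x, weight, padding=1), atol=1e-5)
```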
MobileNetV1(CVPR2017)
Depthwise separable convolution (the core building block)
PyTorch implementation
```python
import torch
import torch.nn as nn

class DWConvBlock(nn.Module):
    """Depthwise separable convolution: a depthwise conv followed by a
    pointwise (1x1) conv, each with BN + ReLU."""
    def __init__(self, in_channel, out_channel,
                 kernel_size, stride=1, padding=0):
        super(DWConvBlock, self).__init__()
        # Depthwise: groups=in_channel, so each kernel sees exactly one channel
        self.dconv = nn.Conv2d(in_channel, in_channel,
                               kernel_size, stride=stride,
                               padding=padding,
                               groups=in_channel, bias=False)
        self.bn1 = nn.BatchNorm2d(in_channel)
        self.relu1 = nn.ReLU(inplace=True)
        # Pointwise: 1x1 conv mixes information across channels
        self.pconv = nn.Conv2d(in_channel, out_channel, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channel)
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.dconv(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pconv(x)
        x = self.bn2(x)
        x = self.relu2(x)
        return x
```
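A quick, illustrative comparison against a plain 3×3 convolution shows where the savings come from:

```python
block = DWConvBlock(32, 64, kernel_size=3, stride=1, padding=1)
std = nn.Conv2d(32, 64, 3, padding=1)

n_block = sum(p.numel() for p in block.parameters())
n_std = sum(p.numel() for p in std.parameters())
print(n_block, n_std)  # the separable block has far fewer weights
```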
Network architecture
PyTorch implementation
```python
class MobileNet_V1(nn.Module):
    def __init__(self, num_class=5):
        super(MobileNet_V1, self).__init__()
        """
        input shape: 224x224x3
        """
        # First standard conv (the original paper also adds BN here)
        self.conv1 = nn.Conv2d(3, 32, 3, 2, 1)
        self.relu1 = nn.ReLU(inplace=True)
        self.dwconv1 = DWConvBlock(32, 64, kernel_size=3, padding=1)
        self.dwconv2 = DWConvBlock(64, 128, 3, stride=2, padding=1)
        self.dwconv3 = DWConvBlock(128, 128, kernel_size=3, padding=1)
        self.dwconv4 = DWConvBlock(128, 256, kernel_size=3, stride=2, padding=1)
        self.dwconv5 = DWConvBlock(256, 256, kernel_size=3, stride=1, padding=1)
        self.dwconv6 = DWConvBlock(256, 512, kernel_size=3, stride=2, padding=1)
        # Five independent 512->512 blocks; note that [module] * 5 would
        # register the SAME block five times (shared weights)
        self.dwconv_x5 = nn.ModuleList([DWConvBlock(512, 512, 3, 1, 1)
                                        for _ in range(5)])
        self.dwconv7 = DWConvBlock(512, 1024, 3, 2, 1)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(1024, num_class)
        # Drop this Softmax if training with nn.CrossEntropyLoss,
        # which already applies log-softmax internally
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.dwconv1(x)
        x = self.dwconv2(x)
        x = self.dwconv3(x)
        x = self.dwconv4(x)
        x = self.dwconv5(x)
        x = self.dwconv6(x)
        for dwconv in self.dwconv_x5:
            x = dwconv(x)
        x = self.dwconv7(x)
        x = self.avgpool(x)
        x = self.flatten(x)
        x = self.fc(x)
        x = self.softmax(x)
        return x
```
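A smoke test of the full network (input size per the docstring):

```python
model = MobileNet_V1(num_class=5)
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)  # torch.Size([1, 5])
```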
MobileNetV2(CVPR2018)
Drawbacks of MobileNetV1:
- No residual connections
- Depthwise kernels often come out of training as all zeros: each kernel has very few weights and sees only one channel (purely 2-D spatial information), so under ReLU plus low-precision weights (int8/int16) they easily die
Improvements in V2
A pointwise convolution first expands the channels (with ReLU6 activation), the depthwise convolution then runs in that higher-dimensional space, and a final pointwise projection reduces the dimension with a linear activation.
The structure is similar to ResNet's bottleneck, but inverted.
Why the ReLU6 activation?
A standard ReLU is unbounded above, but lightweight models often represent weights and activations with a small number of bits as integers (int8/int16), which cannot encode very large values. Capping the activation at 6 (ReLU6) therefore behaves much better under low-precision deployment.
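Concretely (a tiny illustration, not from the original post):

```python
import torch
import torch.nn as nn

x = torch.tensor([-2.0, 3.0, 8.0])
print(nn.ReLU6()(x))  # tensor([0., 3., 6.]) -- clamped to [0, 6]
```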
Bottleneck residual block
A 1×1 convolution first expands the channels, the depthwise convolution then extracts features in the high-dimensional space, and a 1×1 projection brings the dimension back down. The inverted residual is narrow at both ends and wide in the middle; a ResNet bottleneck is wide at both ends and narrow in the middle.
A residual connection is used only in blocks that perform no downsampling and whose input and output shapes match; stride-2 blocks have no shortcut.
- t: channel expansion factor
- c: number of output feature-map channels
- n: number of times the bottleneck is repeated
- s: stride (of the first repetition)
Note: the first bottleneck has an expansion factor of 1, so it needs no 1×1 expansion convolution.
PyTorch implementation
```python
# Conv + BN + ReLU6 block
class ConvBNReLu(nn.Module):
    def __init__(self, in_channel, out_channel, kernel_size=3, stride=1, groups=1):
        super(ConvBNReLu, self).__init__()
        # "same"-style padding for odd kernel sizes
        padding = (kernel_size - 1) // 2
        self.conv = nn.Sequential(
            nn.Conv2d(in_channel, out_channel, kernel_size, stride, padding,
                      groups=groups, bias=False),
            nn.BatchNorm2d(out_channel),
            nn.ReLU6(inplace=True)
        )

    def forward(self, x):
        x = self.conv(x)
        return x
```
```python
class InvertedResidual(nn.Module):
    def __init__(self, in_channel, out_channel, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        """
        Called an *inverted* residual because a classic residual block
        reduces then restores the dimension, while this one expands first
        and projects down afterwards.
        """
        # Expanded (hidden) channel count
        hidden_channel = in_channel * expand_ratio
        # Shortcut only when there is no downsampling (stride == 1)
        # and the input/output channel counts match
        self.use_shortcut = stride == 1 and in_channel == out_channel
        layers = []
        # 1x1 expansion; skipped when expand_ratio == 1 (the first bottleneck)
        if expand_ratio != 1:
            layers.append(ConvBNReLu(in_channel, hidden_channel, kernel_size=1))
        layers.extend([
            # depthwise
            ConvBNReLu(hidden_channel, hidden_channel, stride=stride,
                       groups=hidden_channel),
            # pointwise projection (linear activation)
            nn.Conv2d(hidden_channel, out_channel, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channel)
        ])
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        if self.use_shortcut:
            return x + self.conv(x)
        else:
            return self.conv(x)
```
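A shape check on the block (illustrative values):

```python
block = InvertedResidual(32, 32, stride=1, expand_ratio=6)
x = torch.randn(1, 32, 28, 28)
print(block.use_shortcut)  # True: stride 1 and matching channels
print(block(x).shape)      # torch.Size([1, 32, 28, 28])
```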
Overall network structure
PyTorch implementation
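The code below relies on a make_divisible helper that the post never defines; a common implementation (the one used in the original TensorFlow code and in torchvision) looks like this:

```python
def make_divisible(ch, divisor=8, min_ch=None):
    # Round the channel count to the nearest multiple of `divisor`,
    # but never drop more than ~10% below the requested value
    if min_ch is None:
        min_ch = divisor
    new_ch = max(min_ch, int(ch + divisor / 2) // divisor * divisor)
    if new_ch < 0.9 * ch:
        new_ch += divisor
    return new_ch
```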
```python
class MobileNetV2(nn.Module):
    def __init__(self, num_classes=5, alpha=1.0, round_nearest=8):
        super(MobileNetV2, self).__init__()
        """
        alpha scales the number of kernels in every conv layer;
        make_divisible rounds channel counts to a multiple of round_nearest.
        """
        input_channel = make_divisible(32 * alpha, round_nearest)
        last_channel = make_divisible(1280 * alpha, round_nearest)
        # Bottleneck configuration table from the paper
        inverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],   # t: channel expansion factor
            [6, 24, 2, 2],   # c: output channels
            [6, 32, 3, 2],   # n: number of repetitions
            [6, 64, 4, 2],   # s: stride (first repetition only)
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1]
        ]
        features = []
        # conv1
        features.append(ConvBNReLu(3, input_channel, stride=2))
        # Walk the bottleneck configuration table
        for t, c, n, s in inverted_residual_setting:
            out_channel = make_divisible(c * alpha, round_nearest)
            for i in range(n):
                # Only the FIRST block of each group uses stride s;
                # the remaining repetitions use stride 1
                stride = s if i == 0 else 1
                features.append(InvertedResidual(input_channel, out_channel,
                                                 stride, expand_ratio=t))
                input_channel = out_channel
        # last 1x1 conv
        features.append(ConvBNReLu(input_channel, last_channel, 1))
        self.features = nn.Sequential(*features)
        # Classification head
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(last_channel, num_classes),
            # Drop this Softmax if training with nn.CrossEntropyLoss
            nn.Softmax(dim=1)
        )
        # Weight initialization:
        # - conv: Kaiming normal, bias zeroed if present
        # - BN: weight 1, bias 0
        # - linear: N(0, 0.01), bias 0
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
```
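And a quick smoke test:

```python
model = MobileNetV2(num_classes=5)
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)  # torch.Size([1, 5])
```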
MobileNetV3(ICCV2019)
Compared with MobileNetV2, MobileNetV3 has two main updates:
1. An SE channel attention mechanism
2. New activation functions
SE module
The feature map is average-pooled into a 1-D vector whose length equals the number of channels. A first FC layer reduces this to a quarter of the channel count, and a second FC layer expands it back to the channel count, producing an importance weight for every channel. For a 40-channel feature map, for instance, the squeeze step emits 40 values, the first FC compresses them, and the second FC restores 40 per-channel weights.
Two further V3 tweaks: reducing the first conv layer from 32 to 16 filters changes accuracy very little, and the output stage is simplified, again with little accuracy change but reduced latency.
Activation functions
V2 used ReLU6.
V3 instead starts from the Swish activation,
$$\text{Swish}(x) = x \cdot \text{Sigmoid}(x),$$
which is relatively expensive to compute and differentiate and is unfriendly to quantization, so it gets modified.
First the sigmoid is replaced by a hard version:
$$\text{h-sigmoid}(x) = \frac{\text{ReLU6}(x+3)}{6}$$
Building on ReLU has several advantages: it can be computed on essentially any software/hardware platform, and it eliminates a potential source of precision loss under quantization. Replacing swish with h-swish improves efficiency by roughly 15% in quantized mode, and the benefit of h-swish is more pronounced in the deeper layers of the network.
Substituting this into Swish gives h-swish:
$$\text{h-swish}(x) = x \cdot \frac{\text{ReLU6}(x+3)}{6}$$
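As a sketch, h-swish can be built from ReLU6 alone (recent PyTorch versions also provide it directly as nn.Hardswish):

```python
import torch.nn as nn

class HSwish(nn.Module):
    """h-swish(x) = x * ReLU6(x + 3) / 6"""
    def __init__(self, inplace=True):
        super().__init__()
        self.relu6 = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        return x * self.relu6(x + 3.0) / 6.0
```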
MobileNetV3 comes in two versions:
Large version
- NL: non-linearity used in the block (HS: h-swish, RE: ReLU)
- s: stride
```python
from functools import partial

def mobilenet_v3_large(num_classes=5, reduced_tail=False):
    width_multi = 1.0
    bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels,
                              width_multi=width_multi)
    # Optionally shrink the last bneck stages further
    reduce_divider = 2 if reduced_tail else 1
    # large config
    inverted_residual_setting = [
        # in_c, k_s, expand_c, out_c, use_se, activation, stride
        bneck_conf(16, 3, 16, 16, False, "RE", 1),
        bneck_conf(16, 3, 64, 24, False, "RE", 2),
        bneck_conf(24, 3, 72, 24, False, "RE", 1),
        bneck_conf(24, 5, 72, 40, True, "RE", 2),
        bneck_conf(40, 5, 120, 40, True, "RE", 1),
        bneck_conf(40, 5, 120, 40, True, "RE", 1),
        bneck_conf(40, 3, 240, 80, False, "HS", 2),
        bneck_conf(80, 3, 200, 80, False, "HS", 1),
        bneck_conf(80, 3, 184, 80, False, "HS", 1),
        bneck_conf(80, 3, 184, 80, False, "HS", 1),
        bneck_conf(80, 3, 480, 112, True, "HS", 1),
        bneck_conf(112, 3, 672, 112, True, "HS", 1),
        # out_c is divided as well so channel counts stay consistent
        # when reduced_tail=True
        bneck_conf(112, 5, 672, 160 // reduce_divider, True, "HS", 2),
        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider,
                   160 // reduce_divider, True, "HS", 1),
        bneck_conf(160 // reduce_divider, 5, 960 // reduce_divider,
                   160 // reduce_divider, True, "HS", 1)
    ]
    last_channel = adjust_channels(1280 // reduce_divider)
    return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
                       last_channel=last_channel,
                       num_classses=num_classes)
```
Small version
```python
def mobilenet_v3_small(num_classes=5,
                       reduced_tail=False):
    width_multi = 1.0
    bneck_conf = partial(InvertedResidualConfig, width_multi=width_multi)
    adjust_channels = partial(InvertedResidualConfig.adjust_channels,
                              width_multi=width_multi)
    # Optionally shrink the last bneck stages further
    reduce_divider = 2 if reduced_tail else 1
    # small config
    inverted_residual_setting = [
        # in_c, k_s, expand_c, out_c, use_se, activation, stride
        bneck_conf(16, 3, 16, 16, True, "RE", 2),
        bneck_conf(16, 3, 72, 24, False, "RE", 2),
        bneck_conf(24, 3, 88, 24, False, "RE", 1),
        bneck_conf(24, 5, 96, 40, True, "HS", 2),
        bneck_conf(40, 5, 240, 40, True, "HS", 1),
        bneck_conf(40, 5, 240, 40, True, "HS", 1),
        bneck_conf(40, 5, 120, 48, True, "HS", 1),
        bneck_conf(48, 5, 144, 48, True, "HS", 1),
        # out_c is divided as well so channel counts stay consistent
        # when reduced_tail=True
        bneck_conf(48, 5, 288, 96 // reduce_divider, True, "HS", 2),
        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider,
                   96 // reduce_divider, True, "HS", 1),
        bneck_conf(96 // reduce_divider, 5, 576 // reduce_divider,
                   96 // reduce_divider, True, "HS", 1)
    ]
    last_channel = adjust_channels(1024 // reduce_divider)
    return MobileNetV3(inverted_residual_setting=inverted_residual_setting,
                       last_channel=last_channel,
                       num_classses=num_classes)
```
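Assuming the MobileNetV3 wrapper class (not shown in this post) is in scope, a quick smoke test could be:

```python
model = mobilenet_v3_small(num_classes=5)
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)  # expected: torch.Size([1, 5])
```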
Code
Conv block
```python
class ConvBNActivation(nn.Module):
    def __init__(self, in_channel: int,
                 out_channel: int,
                 kernel_size: int = 3,
                 stride: int = 1,
                 groups: int = 1,
                 norm_layer=None,
                 activation_layer=None):
        super(ConvBNActivation, self).__init__()
        padding = (kernel_size - 1) // 2
        # Defaults: BN + ReLU6. These must stay local variables,
        # since they are called below to build the layers
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if activation_layer is None:
            activation_layer = nn.ReLU6
        self.Conv = nn.Sequential(
            nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size,
                      stride=stride,
                      padding=padding,
                      groups=groups,
                      bias=False),
            norm_layer(out_channel),
            activation_layer(inplace=True))

    def forward(self, x):
        x = self.Conv(x)
        return x
```
SE channel attention module
```python
import torch.nn.functional as F

# SE block
class SqueezeExcitation(nn.Module):
    def __init__(self, input_channel: int,
                 squeeze_factor: int = 4):
        super(SqueezeExcitation, self).__init__()
        squeeze_c = make_divisible(input_channel // squeeze_factor, 8)
        # 1x1 convs act as the two FC layers
        self.fc1 = nn.Conv2d(input_channel, squeeze_c, 1)
        self.fc2 = nn.Conv2d(squeeze_c, input_channel, 1)

    def forward(self, x):
        # Squeeze: global average pooling over each channel
        scale = F.adaptive_avg_pool2d(x, output_size=(1, 1))
        scale = self.fc1(scale)
        scale = F.relu(scale, inplace=True)
        scale = self.fc2(scale)
        scale = F.hardsigmoid(scale, inplace=True)
        # Excite: per-channel reweighting of the input
        return scale * x
```
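Shape check (illustrative):

```python
se = SqueezeExcitation(40)
x = torch.randn(1, 40, 14, 14)
print(se(x).shape)  # torch.Size([1, 40, 14, 14]) -- same shape, channels rescaled
```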
Inverted residual block
```python
# Inverted residual block (V3)
class InvertedResidual(nn.Module):
    def __init__(self,
                 cnf: InvertedResidualConfig,
                 norm_layer):
        super(InvertedResidual, self).__init__()
        if cnf.stride not in [1, 2]:
            raise ValueError("illegal stride value")
        # Shortcut only when stride is 1 and in/out channels match
        self.use_res_connect = (cnf.stride == 1 and cnf.input_c == cnf.out_c)
        layers = []
        activation_layer = nn.Hardswish if cnf.use_hs else nn.ReLU
        # expand
        if cnf.expanded_c != cnf.input_c:
            layers.append(ConvBNActivation(cnf.input_c,
                                           cnf.expanded_c,
                                           kernel_size=1,
                                           norm_layer=norm_layer,
                                           activation_layer=activation_layer))
        # depthwise
        layers.append(ConvBNActivation(cnf.expanded_c,
                                       cnf.expanded_c,
                                       kernel_size=cnf.kernel,
                                       stride=cnf.stride,
                                       groups=cnf.expanded_c,
                                       norm_layer=norm_layer,
                                       activation_layer=activation_layer))
        # channel attention (SE) on the depthwise output
        if cnf.use_se:
            layers.append(SqueezeExcitation(cnf.expanded_c))
        # projection (linear activation, hence nn.Identity)
        layers.append(ConvBNActivation(cnf.expanded_c,
                                       cnf.out_c,
                                       kernel_size=1,
                                       norm_layer=norm_layer,
                                       activation_layer=nn.Identity))
        self.block = nn.Sequential(*layers)
        self.out_channels = cnf.out_c

    def forward(self, x):
        result = self.block(x)
        if self.use_res_connect:
            result += x
        return result
```
bneck parameter configuration
```python
class InvertedResidualConfig:
    def __init__(self,
                 input_c: int,
                 kernel: int,
                 expanded_c: int,
                 out_c: int,
                 use_se: bool,
                 activation: str,
                 stride: int,
                 width_multi: float):
        # All channel counts are scaled by the width multiplier
        self.input_c = self.adjust_channels(input_c, width_multi)
        self.kernel = kernel
        self.expanded_c = self.adjust_channels(expanded_c, width_multi)
        self.out_c = self.adjust_channels(out_c, width_multi)
        self.use_se = use_se
        self.use_hs = activation == "HS"  # h-swish vs ReLU
        self.stride = stride

    @staticmethod
    def adjust_channels(channels: int, width_multi: float):
        return make_divisible(channels * width_multi, 8)
```
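Putting the pieces together (illustrative values taken from the first stride-2 row of the large config):

```python
cnf = InvertedResidualConfig(16, 3, 64, 24, False, "RE", 2, width_multi=1.0)
block = InvertedResidual(cnf, norm_layer=nn.BatchNorm2d)
x = torch.randn(1, 16, 112, 112)
print(block(x).shape)  # torch.Size([1, 24, 56, 56])
```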
References: Bilibili videos by [同济子豪兄](https://www.bilibili.com/video/BV16b4y117XH/?spm_id_from=333.337.search-card.all.click&vd_source=6bd2ca80c4b1977555214d328354fb03) and [霹雳吧啦Wz](https://www.bilibili.com/video/BV1GK4y1p7uE/?spm_id_from=333.788&vd_source=6bd2ca80c4b1977555214d328354fb03)
Papers:
V1: https://arxiv.org/abs/1704.04861
V2: https://arxiv.org/abs/1801.04381
V3: https://arxiv.org/abs/1905.02244