GhostNet: More Features from Cheap Operations
PDF: https://arxiv.org/pdf/1911.11907.pdf
PyTorch代码: https://github.com/shanglianlm0525/PyTorch-Networks
1 Motivation
作者分析了一些训练好的网络输出的特征图,发现其中存在大量冗余信息:一些特征图可以由其他特征图经过一些简单的变换得到。因此提出了能用更少参数提取更多特征的Ghost模块。
下图为输入图像经ResNet50产生的特征图,里面有许多成对的相似特征图
2 Ghost Module
Ghost模块将普通的卷积层分解为两个部分:第一部分是正常的卷积,但卷积核的数量会被严格控制;在给定第一部分得到的固有特征图之后,再应用一系列简单的线性运算以生成更多特征图。
PyTorch代码:
class GhostModule(nn.Module):
    """Ghost module: a small primary convolution plus a cheap depthwise branch.

    The primary branch (conv -> BatchNorm -> optional ReLU) produces the
    "intrinsic" feature maps.  A depthwise 3x3 conv -> BatchNorm -> ReLU6
    branch derives the "ghost" feature maps from them, and both sets are
    concatenated along the channel axis.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of channels returned by ``forward``.
        s: Ratio between total and intrinsic feature maps; with ``s=2`` half
            of the output channels come from the primary convolution.
        kernel_size: Kernel size of the primary convolution (padded with
            ``kernel_size // 2``).
        stride: Stride of the primary convolution.
        use_relu: Whether the primary branch ends with a ReLU.
    """

    def __init__(self, in_channels, out_channels, s=2, kernel_size=1, stride=1, use_relu=True):
        super(GhostModule, self).__init__()
        self.out_channels = out_channels

        # Round up so that intrinsic + ghost maps always cover `out_channels`;
        # `forward` trims the surplus.  For out_channels divisible by `s` this
        # is the same split as plain floor division.
        intrinsic_channels = -(-out_channels // s)
        ghost_channels = intrinsic_channels * (s - 1)

        self.primary_conv = nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=intrinsic_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=kernel_size // 2,
                bias=False,
            ),
            nn.BatchNorm2d(intrinsic_channels),
            nn.ReLU(inplace=True) if use_relu else nn.Sequential(),
        )

        # Same conv -> BN -> ReLU6 stack that DW_Conv3x3BNReLU builds, written
        # out here so the stride is pinned to 1: the primary convolution has
        # already applied `stride`, and striding again would make the two
        # branches differ in spatial size and break the concatenation.
        # NOTE(review): this branch ends with ReLU6 even when use_relu=False.
        self.cheap_op = nn.Sequential(
            nn.Conv2d(
                in_channels=intrinsic_channels,
                out_channels=ghost_channels,
                kernel_size=3,
                stride=1,
                padding=1,
                groups=intrinsic_channels,
                bias=False,
            ),
            nn.BatchNorm2d(ghost_channels),
            nn.ReLU6(inplace=True),
        )

    def forward(self, x):
        """Return intrinsic and ghost maps concatenated on the channel axis."""
        intrinsic = self.primary_conv(x)
        ghost = self.cheap_op(intrinsic)
        out = torch.cat([intrinsic, ghost], dim=1)
        # Drop surplus channels created by rounding the intrinsic width up.
        return out[:, :self.out_channels]
复杂度分析:
与普通卷积神经网络相比,在不更改输出特征图大小的情况下,该Ghost模块中所需的参数总数和计算复杂度均已降低。
3 Ghost Bottlenecks
在Ghost模块的基础上,作者搭建了Ghost bottleneck来建立轻量化的模型。
PyTorch代码:
class GhostBottleneck(nn.Module):
    """Residual bottleneck built from two Ghost modules.

    Main branch: GhostModule (with ReLU) -> strided depthwise conv (only when
    ``stride > 1``) -> SqueezeAndExcite (only when ``use_se``) -> GhostModule
    (``use_relu=False``).  The result is added to a shortcut projection of
    the input.

    Args:
        in_channels: Channels of the input tensor.
        mid_channels: Channels produced by the first Ghost module.
        out_channels: Channels of the block output.
        kernel_size: Kernel size of the depthwise convolution in the main
            branch (only used when ``stride > 1``).
        stride: Spatial stride of the block.
        use_se: Whether to insert a squeeze-and-excite stage.
        se_kernel_size: Unused; kept so existing call sites keep working.
            SqueezeAndExcite pools adaptively and needs no kernel size.
    """

    def __init__(self, in_channels, mid_channels, out_channels, kernel_size, stride, use_se, se_kernel_size=1):
        super(GhostBottleneck, self).__init__()
        self.stride = stride

        if stride > 1:
            # Depthwise conv -> BN -> ReLU6 that honours `kernel_size`; the
            # fixed 3x3 helper would silently ignore the requested kernel.
            downsample = nn.Sequential(
                nn.Conv2d(
                    in_channels=mid_channels,
                    out_channels=mid_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=kernel_size // 2,
                    groups=mid_channels,
                    bias=False,
                ),
                nn.BatchNorm2d(mid_channels),
                nn.ReLU6(inplace=True),
            )
        else:
            downsample = nn.Sequential()

        # `se_kernel_size` is deliberately not forwarded: the third positional
        # parameter of SqueezeAndExcite is its channel-reduction divisor, so
        # passing a spatial size there shrinks the SE bottleneck arbitrarily.
        squeeze_excite = SqueezeAndExcite(mid_channels, mid_channels) if use_se else nn.Sequential()

        self.bottleneck = nn.Sequential(
            GhostModule(in_channels=in_channels, out_channels=mid_channels, kernel_size=1, use_relu=True),
            downsample,
            squeeze_excite,
            GhostModule(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, use_relu=False),
        )

        if stride > 1:
            # Called with the helper's default groups=1, i.e. a dense 3x3
            # convolution that also downsamples the shortcut.
            self.shortcut = DW_Conv3x3BNReLU(in_channels=in_channels, out_channels=out_channels, stride=stride)
        else:
            self.shortcut = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1)

    def forward(self, x):
        """Return main-branch output plus the projected shortcut."""
        out = self.bottleneck(x)
        out += self.shortcut(x)
        return out
4 GhostNet
PyTorch代码:
import torch
import torch.nn as nn
import torchvision
def DW_Conv3x3BNReLU(in_channels, out_channels, stride, groups=1):
    """Build a 3x3 convolution -> BatchNorm -> ReLU6 stack.

    The convolution uses padding 1 and no bias.  It acts as a depthwise
    convolution when the caller passes ``groups`` equal to ``in_channels``;
    with the default ``groups=1`` it is a dense convolution.

    Args:
        in_channels: Input channels of the convolution.
        out_channels: Output channels of the convolution and BatchNorm width.
        stride: Stride of the convolution.
        groups: Number of convolution groups.

    Returns:
        An ``nn.Sequential`` holding the three layers in order.
    """
    conv = nn.Conv2d(in_channels, out_channels, 3, stride, 1, groups=groups, bias=False)
    norm = nn.BatchNorm2d(out_channels)
    activation = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, activation)
class SqueezeAndExcite(nn.Module):
    """Channel re-weighting: global average pool -> two-layer MLP -> scale.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Width of the gate produced by the MLP.  ``forward``
            reshapes the gate to the input's channel count, so this has to
            equal ``in_channels`` for the reshape to succeed.
        divide: Reduction factor for the hidden layer of the MLP.
    """

    def __init__(self, in_channels, out_channels, divide=4):
        super(SqueezeAndExcite, self).__init__()
        # Keep at least one hidden unit: with divide > in_channels the floor
        # division is 0, which would build a zero-width bottleneck whose
        # output no longer depends on the input.
        mid_channels = max(1, in_channels // divide)
        self.pool = nn.AdaptiveAvgPool2d(1)
        # NOTE(review): the gate ends in ReLU6, so it ranges over [0, 6]
        # rather than [0, 1]; the usual SE formulation uses a sigmoid-style
        # gate — confirm this is intentional before changing it.
        self.SEblock = nn.Sequential(
            nn.Linear(in_features=in_channels, out_features=mid_channels),
            nn.ReLU6(inplace=True),
            nn.Linear(in_features=mid_channels, out_features=out_channels),
            nn.ReLU6(inplace=True),
        )

    def forward(self, x):
        """Scale every channel of ``x`` by its learned gate value."""
        batch, channels, _, _ = x.size()
        squeezed = self.pool(x).view(batch, -1)
        gate = self.SEblock(squeezed).view(batch, channels, 1, 1)
        return gate * x
class GhostNet(nn.Module):
    """GhostNet image classifier assembled from GhostBottleneck blocks.

    Pipeline: strided 3x3 stem -> stack of Ghost bottlenecks -> 1x1 conv to
    960 channels -> global average pool -> 1x1 conv to 1280 channels ->
    linear classifier.

    Args:
        num_classes: Size of the classifier output.
    """

    def __init__(self, num_classes=1000):
        super(GhostNet, self).__init__()

        self.first_conv = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU6(inplace=True),
        )

        self.features = nn.Sequential(
            GhostBottleneck(in_channels=16, mid_channels=16, out_channels=16, kernel_size=3, stride=1, use_se=False),
            GhostBottleneck(in_channels=16, mid_channels=64, out_channels=24, kernel_size=3, stride=2, use_se=False),
            GhostBottleneck(in_channels=24, mid_channels=72, out_channels=24, kernel_size=3, stride=1, use_se=False),
            GhostBottleneck(in_channels=24, mid_channels=72, out_channels=40, kernel_size=5, stride=2, use_se=True, se_kernel_size=28),
            GhostBottleneck(in_channels=40, mid_channels=120, out_channels=40, kernel_size=5, stride=1, use_se=True, se_kernel_size=28),
            GhostBottleneck(in_channels=40, mid_channels=120, out_channels=40, kernel_size=5, stride=1, use_se=True, se_kernel_size=28),
            GhostBottleneck(in_channels=40, mid_channels=240, out_channels=80, kernel_size=3, stride=1, use_se=False),
            GhostBottleneck(in_channels=80, mid_channels=200, out_channels=80, kernel_size=3, stride=1, use_se=False),
            GhostBottleneck(in_channels=80, mid_channels=184, out_channels=80, kernel_size=3, stride=2, use_se=False),
            GhostBottleneck(in_channels=80, mid_channels=184, out_channels=80, kernel_size=3, stride=1, use_se=False),
            GhostBottleneck(in_channels=80, mid_channels=480, out_channels=112, kernel_size=3, stride=1, use_se=True, se_kernel_size=14),
            GhostBottleneck(in_channels=112, mid_channels=672, out_channels=112, kernel_size=3, stride=1, use_se=True, se_kernel_size=14),
            GhostBottleneck(in_channels=112, mid_channels=672, out_channels=160, kernel_size=5, stride=2, use_se=True, se_kernel_size=7),
            GhostBottleneck(in_channels=160, mid_channels=960, out_channels=160, kernel_size=5, stride=1, use_se=True, se_kernel_size=7),
            GhostBottleneck(in_channels=160, mid_channels=960, out_channels=160, kernel_size=5, stride=1, use_se=True, se_kernel_size=7),
        )

        self.last_stage = nn.Sequential(
            nn.Conv2d(in_channels=160, out_channels=960, kernel_size=1, stride=1),
            nn.BatchNorm2d(960),
            nn.ReLU6(inplace=True),
            # Global pooling: equals a 7x7 average pool on a 7x7 map (the
            # 224x224 case) and keeps the classifier input at 1280 features
            # for other input resolutions as well.
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels=960, out_channels=1280, kernel_size=1, stride=1),
            nn.ReLU6(inplace=True),
        )
        self.classifier = nn.Linear(in_features=1280, out_features=num_classes)

    def init_params(self):
        """Re-initialise conv, batch-norm and linear layers in place.

        Not called by ``__init__``; invoke it explicitly when wanted.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                # Convolutions followed by BatchNorm are built with
                # bias=False, so the bias can be None.
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                # A constant weight would make every output unit identical;
                # draw small random weights instead.
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map a batch of 3-channel images to ``num_classes`` logits."""
        x = self.first_conv(x)
        x = self.features(x)
        x = self.last_stage(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)
if __name__ == '__main__':
    # Smoke test: build the network, print its structure, and push one random
    # 224x224 RGB image through it to show the output shape.
    model = GhostNet()
    print(model)

    # Named `dummy_input` rather than `input` so the builtin is not shadowed.
    dummy_input = torch.randn(1, 3, 224, 224)
    out = model(dummy_input)
    print(out.shape)
5 Ablation
- d=3的表现最好,这是因为1x1的卷积核无法在特征图上引入空间信息,而d为5或者7则会导致过拟合和更大的计算量
- 当增大s时,计算量(FLOPs)明显减少,准确率也随之逐渐下降;当s为2时,代表着VGG16被压缩了约两倍,而作者提出的方法表现比原始VGG16还要好一点
6 Visualization of Feature Maps
如下图可以看出Ghost其实使得同一个特征图中不同通道包含了不同的特征信息,增强了模型的表现力