Ever since VGG, large convolution kernels have fallen out of favor. People prefer to stack small kernels when extracting feature maps, the argument being that this keeps the receptive field the same while using fewer parameters, making the whole network more efficient.
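A quick back-of-the-envelope check of that argument (a minimal sketch; the channel count is an arbitrary example):

# Two stacked 3x3 convs see a 5x5 receptive field, yet use fewer weights
# than a single 5x5 conv (c input/output channels, biases ignored).
c = 64
two_3x3 = 2 * c * c * 3 * 3   # 73,728 weights
one_5x5 = c * c * 5 * 5       # 102,400 weights
print(two_3x3, one_5x5)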
Are large kernels really that hopeless? If the parameter count is the concern, we can factor the convolution: instead of one k×k convolution, do it in two steps, a k×1 convolution followed by a 1×k one. This spatially separable form cuts a kernel's parameters from k² down to 2k.
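To make that concrete, here is a quick parameter count (a sketch; the channel counts are illustrative, and k=7 matches the default kernel size used in the code below):

import torch.nn as nn

k, cin, cout = 7, 256, 21  # illustrative sizes
full = nn.Conv2d(cin, cout, kernel_size=k, padding=k // 2)
sep = nn.Sequential(
    nn.Conv2d(cin, cout, kernel_size=(k, 1), padding=(k // 2, 0)),
    nn.Conv2d(cout, cout, kernel_size=(1, k), padding=(0, k // 2)),
)

def n_params(m):
    return sum(p.numel() for p in m.parameters())

print(n_params(full), n_params(sep))  # 263445 vs 40761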
The GCN paper frames semantic segmentation as a task that couples classification with localization, and it cleverly finds a middle ground between the two. It sorts the classic networks that came before it into two camps:
The classification camp: AlexNet, VGGNet, GoogLeNet, ResNet
The localization camp: FCN, U-Net, DeepLab, DeconvNet
First, the code:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
class GCN(nn.Module):
    """Global Convolutional Network block: approximates a large ks x ks
    convolution with two cheap branches, (ks x 1 -> 1 x ks) and
    (1 x ks -> ks x 1), whose outputs are summed."""

    def __init__(self, inplanes, planes, ks=7):
        super(GCN, self).__init__()
        # left branch: ks x 1 followed by 1 x ks
        self.conv_l1 = nn.Conv2d(inplanes, planes, kernel_size=(ks, 1),
                                 padding=(ks // 2, 0))
        self.conv_l2 = nn.Conv2d(planes, planes, kernel_size=(1, ks),
                                 padding=(0, ks // 2))
        # right branch: 1 x ks followed by ks x 1
        self.conv_r1 = nn.Conv2d(inplanes, planes, kernel_size=(1, ks),
                                 padding=(0, ks // 2))
        self.conv_r2 = nn.Conv2d(planes, planes, kernel_size=(ks, 1),
                                 padding=(ks // 2, 0))

    def forward(self, x):
        x_l = self.conv_l2(self.conv_l1(x))
        x_r = self.conv_r2(self.conv_r1(x))
        return x_l + x_r  # sum of the two separable branches
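As a quick sanity check (continuing from the definitions above; the feature-map size is just an illustrative 1/32-scale shape), the block preserves spatial size and maps inplanes channels down to planes:

# Illustrative shape check for the GCN block.
gcn = GCN(2048, 12, ks=7)
feat = torch.randn(1, 2048, 11, 15)  # e.g. a 1/32-scale feature map
print(gcn(feat).shape)               # torch.Size([1, 12, 11, 15])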
class Refine(nn.Module):
    """Boundary Refinement block: a residual branch of two 3 x 3
    convolutions (pre-activation style) added back onto the input."""

    def __init__(self, planes):
        super(Refine, self).__init__()
        # two separate BatchNorm layers: sharing a single instance across
        # both positions would mix the statistics of different activations
        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1)

    def forward(self, x):
        residual = x
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv1(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.conv2(x)
        out = residual + x  # residual connection
        return out
class FCN(nn.Module):
    """GCN segmentation framework on a ResNet-50 backbone. (Named FCN here
    only to distinguish it from the GCN block above; it is not the fully
    convolutional network of Long et al.)"""

    def __init__(self, num_classes):
        super(FCN, self).__init__()
        self.num_classes = num_classes
        # ImageNet-pretrained ResNet-50 as the encoder
        resnet = models.resnet50(pretrained=True)
        self.conv1 = resnet.conv1
        self.bn0 = resnet.bn1
        self.relu = resnet.relu
        self.maxpool = resnet.maxpool
        self.layer1 = resnet.layer1
        self.layer2 = resnet.layer2
        self.layer3 = resnet.layer3
        self.layer4 = resnet.layer4
        # one GCN block per feature scale, each projecting to num_classes channels
        self.gcn1 = GCN(2048, self.num_classes)
        self.gcn2 = GCN(1024, self.num_classes)
        self.gcn3 = GCN(512, self.num_classes)
        self.gcn4 = GCN(64, self.num_classes)
        self.gcn5 = GCN(64, self.num_classes)
        # ten Refine blocks: one after each GCN, one after each fusion
        self.refine1 = Refine(self.num_classes)
        self.refine2 = Refine(self.num_classes)
        self.refine3 = Refine(self.num_classes)
        self.refine4 = Refine(self.num_classes)
        self.refine5 = Refine(self.num_classes)
        self.refine6 = Refine(self.num_classes)
        self.refine7 = Refine(self.num_classes)
        self.refine8 = Refine(self.num_classes)
        self.refine9 = Refine(self.num_classes)
        self.refine10 = Refine(self.num_classes)
    def forward(self, x):
        input_size = x.size()[2:]  # remember the input resolution
        # ResNet stem: 1/2 scale after conv1, 1/4 after maxpool
        x = self.conv1(x)
        x = self.bn0(x)
        x = self.relu(x)
        conv_x = x           # 1/2 scale, 64 channels
        x = self.maxpool(x)
        pool_x = x           # 1/4 scale, 64 channels
        # ResNet stages at 1/4, 1/8, 1/16 and 1/32 scale
        fm1 = self.layer1(x)
        fm2 = self.layer2(fm1)
        fm3 = self.layer3(fm2)
        fm4 = self.layer4(fm3)
        # GCN + boundary refinement at every scale
        gcfm1 = self.refine1(self.gcn1(fm4))
        gcfm2 = self.refine2(self.gcn2(fm3))
        gcfm3 = self.refine3(self.gcn3(fm2))
        gcfm4 = self.refine4(self.gcn4(pool_x))
        gcfm5 = self.refine5(self.gcn5(conv_x))
        # top-down pathway: upsample, add the skip, refine (FPN-style fusion)
        fs1 = self.refine6(F.interpolate(gcfm1, fm3.size()[2:], mode='bilinear', align_corners=True) + gcfm2)
        fs2 = self.refine7(F.interpolate(fs1, fm2.size()[2:], mode='bilinear', align_corners=True) + gcfm3)
        fs3 = self.refine8(F.interpolate(fs2, pool_x.size()[2:], mode='bilinear', align_corners=True) + gcfm4)
        fs4 = self.refine9(F.interpolate(fs3, conv_x.size()[2:], mode='bilinear', align_corners=True) + gcfm5)
        # final upsample back to the input resolution
        out = self.refine10(F.interpolate(fs4, input_size, mode='bilinear', align_corners=True))
        return out
if __name__ == "__main__":
    # Sanity check that the network runs end to end. The CamVid dataset is
    # used below, hence 12 classes.
    rgb = torch.randn(4, 3, 352, 480)  # input batch
    net = FCN(12)
    out = net(rgb)
    print(out.shape)  # expected: torch.Size([4, 12, 352, 480])
Now let's visualize the structure. The diagram here is hand-drawn and a bit messy, but it still lays everything out clearly.
The top left shows the GCN block; there are five of them in total. The bottom left shows the Refine block (used to sharpen object boundaries); there are ten of those. The right side shows the overall GCN framework. To keep its name distinct from the GCN block, I named the full framework FCN (not the fully convolutional network!).
Unlike the original paper, the framework performs one extra fusion at the very end with the input X's scale, so five fusions in total. The fusion is FPN-style: the two feature maps have the same channel count and the same spatial size, so they are added elementwise.
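Concretely, a single fusion step looks like this (continuing from the code above; the sizes are illustrative only):

# FPN-style fusion: upsample the coarser map to the finer map's size,
# add elementwise (both already have num_classes channels), then refine.
top = torch.randn(1, 12, 22, 30)   # coarser GCN output
skip = torch.randn(1, 12, 44, 60)  # finer GCN output
fused = F.interpolate(top, skip.size()[2:], mode='bilinear', align_corners=True) + skip
print(fused.shape)  # torch.Size([1, 12, 44, 60])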
Experiments
The downsampling path uses ResNet-50 pretrained on ImageNet. Counting the conv1 stem, the max pool, and the stride-2 stages, the resolution is halved five times, so the smallest feature map is 1/2^5 = 1/32 of the input (you could of course stop earlier, say at 1/8). For upsampling (restoring resolution) I use simple, brute-force bilinear interpolation instead of the deconvolution used in the paper, because deconvolution adds computation.
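To see the trade-off (a minimal sketch, not from the paper), compare a parameter-free bilinear 2x upsample with a learnable transposed convolution of the same factor:

# Bilinear interpolation has no learnable parameters; a 2x transposed
# convolution with the same channel count does.
x = torch.randn(1, 12, 44, 60)
up_bilinear = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
deconv = nn.ConvTranspose2d(12, 12, kernel_size=4, stride=2, padding=1)
up_deconv = deconv(x)
print(up_bilinear.shape, up_deconv.shape)  # both torch.Size([1, 12, 88, 120])
print(sum(p.numel() for p in deconv.parameters()))  # 2316 extra parameters per stage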
This time I experiment on the relatively simple CamVid dataset; if we can't even handle CamVid, harder datasets are out of the question... The hardware is an NVIDIA GeForce RTX 3070 (8 GB).
I trained for 20 epochs with a batch size of 2 (anything larger runs out of GPU memory).
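For reference, the training loop was roughly the following (a minimal sketch: the CamVidDataset wrapper, the optimizer, and the learning rate are assumptions for illustration; only the batch size, epoch count, and class count come from this post):

from torch.utils.data import DataLoader

# CamVidDataset is a hypothetical (image, label) dataset wrapper, not shown here.
train_loader = DataLoader(CamVidDataset(split='train'), batch_size=2, shuffle=True)
net = FCN(num_classes=12).cuda()
criterion = nn.CrossEntropyLoss()                        # per-pixel classification loss
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)  # assumed optimizer and lr

for epoch in range(20):
    for img, label in train_loader:
        img, label = img.cuda(), label.cuda()
        optimizer.zero_grad()
        loss = criterion(net(img), label)  # output: (B, 12, H, W); label: (B, H, W)
        loss.backward()
        optimizer.step()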
The test-set mIoU clearly hasn't converged yet, but the validation set has already begun to converge! The best validation mIoU is 63%.
I will later test on PASCAL VOC 2012 and Cityscapes to verify how well it generalizes.
Original GCN paper: https://arxiv.org/abs/1703.02719