1.前置
之前对语义分割看待的角度都是微观的,即实现每一个像素的分类,而DFN从宏观角度来分析语义分割,将语义分割看作一个任务,将一致的语义标签分配给一类事物,而不是每个像素。按照这一角度,语义分割中就会存在类间不一致和类内不一致的问题。
注意力机制:把注意力集中到重要的点上,忽略其他不重要的点。注意力机制可以帮助模型为输入图像的各个部分分配不同的权重,提取更关键、更重要的信息,使模型能够做出更准确的判断,同时不会给模型的计算和存储带来更多的消耗。按注意力关注的域分类,有空间域,通道域,层域,混合域,时间域。DANet引入了一种自注意力机制来分别捕捉空间域和通道域中的视觉特征依赖性。
这里其实不太明白为什么最后还要做一个相加的操作,或者说为什么最后不相乘呢?个人粗浅的理解这个注意力应该是类似一个权重矩阵,对应到原来图像上的像素,有的像素重要权重就大,有的像素不重要权重就小,不是很明白把这个权重做加法到像素的意义?
ExFuse摘要:
- 背景介绍:现代语义分割框架通常使用特征融合的方式提升性能,但由于高级特征和低级特征之间存在差距,直接融合的效果并不是很好
- 文章思想:在低层特征中引入语义信息,在高层特征中引入细节信息会使后续融合更有效
- 文章亮点:提出了ExFuse,用来弥补高低层特征之间的差距
- 模型评估:在PASCAL VOC 2012数据集中得到了87.9%的mIoU
ExFuse网络认为直接将低级特征和高级特征做融合得到的效果其实并没有想象中的好,如上图上半部分所示,直接融合带来的提升并不明显,而如果将高级特征和低级特征先提前相互引入然后通过边界对齐的方式做融合,那么出来的结果则是比较明显的。
ExFuse用了ResNeXt作为主网络,ResNeXt可以说是ResNet的升级版,它可以在不增加参数复杂度的前提下提高准确率,同时还减少了超参数的数量。相比较于传统网络不断加深网络深度来说,个人认为ResNeXt是加深了单个模块的宽度,如下图所示,其将256通道数直接骤减为4通道数,同时并行32路最后做融合。
DFN摘要:
- 背景介绍:现代语义分割算法会存在类内不一致和类间不一致的问题
- 算法组成:提出了DFN网络,包括平滑网络和边界网络两部分
- 具体作用:平滑网络用于解决类内不一致,通过引入注意力机制和全局平均池化选择更具代表性的特征;边界网络通过深度语义边界监督更好的区分双边特征
- 模型评估:在PASCAL VOC 2012和Cityscapes数据集中分别得到了86.2%和80.3%的mIoU
类内不一致:所属同一类,但是模型将其分错,认为是不同的类,多尺度和上下文信息对类内一致有帮助
类间不一致:所属不同类,但是他们的特征可能很像,模型认为他们属于同一类
2.Pytorch实现ExFuse&DFN
ExFuse(SS模块和DAP模块未实现):
from PIL import Image
import torch.nn as nn
from torchvision import models
import torch
# ImageNet-pretrained ResNet-101 backbone; pretrained=True downloads weights on first use.
# NOTE(review): this module-level instance appears unused by GCNFuse below, which
# constructs its own backbone via models.resnet101 — confirm before removing.
resnet101=models.resnet101(pretrained=True)
class SEB(nn.Module):
    """Semantic Embedding Branch (ExFuse).

    Gates a low-level feature map multiplicatively with semantic information
    taken from a higher-level (lower-resolution) feature map: the high-level
    map is projected by a 3x3 conv and bilinearly upsampled x2 before the
    element-wise product.
    """

    def __init__(self, in_channels, out_channels):
        super(SEB, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        # x2 bilinear upsampling to match the low-level map's spatial size.
        self.upsample = nn.Upsample(scale_factor=2, mode="bilinear")

    def forward(self, x):
        # x is a pair: (low-level map, high-level map).
        low_feat, high_feat = x
        semantic = self.upsample(self.conv(high_feat))
        return low_feat * semantic
class ECRE(nn.Module):
    """Explicit Channel Resolution Embedding (ExFuse).

    Upsamples a feature map by `up_scale` using sub-pixel convolution:
    a 3x3 conv expands channels by up_scale**2, then nn.PixelShuffle
    rearranges those channels into spatial resolution.
    """

    def __init__(self, in_c, up_scale=2):
        super(ECRE, self).__init__()
        expanded = in_c * up_scale ** 2
        self.ecre = nn.Sequential(
            nn.Conv2d(in_c, expanded, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(expanded),
            # PixelShuffle turns (B, C*r^2, H, W) into (B, C, H*r, W*r).
            nn.PixelShuffle(up_scale),
        )

    def forward(self, input_):
        return self.ecre(input_)
class _GlobalConvModule(nn.Module):
def __init__(self,in_channels,num_class,k=15):
super(_GlobalConvModule,self).__init__()
pad=(k-1)//2
self.conv1=nn.Sequential(nn.Conv2d(in_channels,num_class,kernel_size=(1,k),padding=(0,pad),bias=False),
nn.Conv2d(num_class,num_class,kernel_size=(k,1),padding=(pad,0),bias=False)
)
self.conv2=nn.Sequential(nn.Conv2d(in_channels,num_class,kernel_size=(k,1),padding=(pad,0),bias=False),
nn.Conv2d(num_class,num_class,kernel_size=(1,k),padding=(0,pad),bias=False)
)
def forward(self,x):
x1=self.conv1(x)
x2=self.conv2(x)
assert x1.shape==x2.shape
return x1+x2
class GCNFuse(nn.Module):
    """ExFuse-style segmentation head on a ResNet-101 backbone.

    Each backbone stage is turned into a class-score map via a
    _GlobalConvModule; higher-stage semantics are injected into lower
    stages through SEB blocks, the deepest map is upsampled with ECRE,
    and the maps are fused top-down through learned x2 deconvolutions
    back to the input resolution.
    """

    def __init__(self, num_classes=21):
        super(GCNFuse, self).__init__()
        self.num_classes = num_classes

        # ResNet-101 backbone split into its five stages.
        self.resnet_features = models.resnet101(pretrained=True)
        self.layer0 = nn.Sequential(self.resnet_features.conv1, self.resnet_features.bn1, self.resnet_features.relu)
        self.layer1 = nn.Sequential(self.resnet_features.maxpool, self.resnet_features.layer1)
        self.layer2 = self.resnet_features.layer2
        self.layer3 = self.resnet_features.layer3
        self.layer4 = self.resnet_features.layer4

        # One GCN module per stage, mapping stage channels to class scores.
        self.gcm4 = _GlobalConvModule(2048, num_classes)
        self.gcm3 = _GlobalConvModule(1024, num_classes)
        self.gcm2 = _GlobalConvModule(512, num_classes)
        self.gcm1 = _GlobalConvModule(256, num_classes)

        # Learned x2 upsampling steps of the top-down decoder.
        self.deconv3 = nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, padding=1, bias=False)
        self.deconv2 = nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, padding=1, bias=False)
        self.deconv1 = nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, padding=1, bias=False)
        self.deconv0 = nn.ConvTranspose2d(num_classes, num_classes, kernel_size=4, stride=2, padding=1, bias=False)

        # Sub-pixel x2 upsampling for the deepest prediction.
        self.ecre = ECRE(num_classes)

        # SEB input channels are the concatenated higher-stage channels.
        self.seb3 = SEB(2048, 1024)        # f4 (2048) -> gate f3
        self.seb2 = SEB(3072, 512)         # f3 (1024) + f4 up x2 (2048) -> gate f2
        self.seb1 = SEB(3584, 256)         # f2 (512) + f3 up x2 (1024) + f4 up x4 (2048) -> gate f1

        self.upsample2 = nn.Upsample(scale_factor=2, mode="bilinear")
        self.upsample4 = nn.Upsample(scale_factor=4, mode="bilinear")

    def forward(self, x):
        # Backbone feature pyramid.
        stage0 = self.layer0(x)
        stage1 = self.layer1(stage0)
        stage2 = self.layer2(stage1)
        stage3 = self.layer3(stage2)
        stage4 = self.layer4(stage3)

        # Deepest class-score map, upsampled x2 via pixel shuffle.
        coarse = self.ecre(self.gcm4(stage4))

        # Semantic-embedded lateral score maps for stages 3, 2, 1.
        lateral3 = self.gcm3(self.seb3([stage3, stage4]))
        context2 = torch.cat([stage3, self.upsample2(stage4)], dim=1)
        lateral2 = self.gcm2(self.seb2([stage2, context2]))
        context1 = torch.cat([stage2, self.upsample2(stage3), self.upsample4(stage4)], dim=1)
        lateral1 = self.gcm1(self.seb1([stage1, context1]))

        # Top-down fusion back to the input resolution.
        out = self.deconv3(lateral3 + coarse)
        out = self.deconv2(lateral2 + out)
        out = self.deconv1(lateral1 + out)
        return self.deconv0(out)
if __name__ == "__main__":
    # Smoke test: a 512x512 RGB input should come back at full resolution
    # with num_classes channels.
    net = GCNFuse(21)
    net.eval()
    dummy = torch.randn(1, 3, 512, 512)
    out = net(dummy)
    print('result:', out.size())
DFN.py
from PIL import Image
import torch.nn as nn
from torchvision import models
import torch
# ImageNet-pretrained ResNet-101 shared by the DFN stages below;
# pretrained=True downloads the weights on first use.
resnet101=models.resnet101(pretrained=True)
class RRB(nn.Module):
    """Refinement Residual Block (DFN).

    A 1x1 conv first maps the input to `out_channels`; a conv-BN-ReLU-conv
    residual branch refines it, and the sum passes through a final ReLU.
    """

    def __init__(self, in_channels, out_channels):
        super(RRB, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        identity = self.conv1(x)
        residual = self.relu(self.bn(self.conv2(identity)))
        residual = self.conv3(residual)
        return self.relu(identity + residual)
class CAB(nn.Module):
    """Channel Attention Block (DFN).

    Concatenates the two input maps, global-average-pools the result,
    squeezes it through two 1x1 convs into a per-channel sigmoid gate,
    and returns gate * second_input + first_input.
    """

    def __init__(self, in_channels, out_channels):
        super(CAB, self).__init__()
        self.global_pooling = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
        # NOTE: attribute name kept as in original so state_dict keys match.
        self.sigmod = nn.Sigmoid()

    def forward(self, x):
        # x is a pair: (first map, second map); both feed the gate.
        first, second = x
        gate = self.global_pooling(torch.cat([first, second], dim=1))
        gate = self.relu(self.conv1(gate))
        gate = self.sigmod(self.conv2(gate))
        return gate * second + first
class DFN(nn.Module):
    """Discriminative Feature Network on a ResNet-101 backbone.

    Two heads share the backbone pyramid:
    - border network: RRB-refined stage maps are upsampled and summed
      onto the shallowest stage to sharpen boundaries;
    - smooth network: pooled global context is refined stage by stage
      through CAB attention and RRB blocks.
    forward returns (border_output, smooth_output).
    """

    def __init__(self, num_class=21):
        super(DFN, self).__init__()
        self.num_class = num_class

        # Backbone stages from the module-level pretrained ResNet-101.
        self.layer0 = nn.Sequential(resnet101.conv1, resnet101.bn1, resnet101.relu)
        self.layer1 = nn.Sequential(resnet101.maxpool, resnet101.layer1)
        self.layer2 = resnet101.layer2
        self.layer3 = resnet101.layer3
        self.layer4 = resnet101.layer4

        # ---- Smooth network ----
        self.out_conv = nn.Conv2d(2048, self.num_class, kernel_size=1, stride=1)
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.cab1 = CAB(self.num_class * 2, self.num_class)
        self.cab2 = CAB(self.num_class * 2, self.num_class)
        self.cab3 = CAB(self.num_class * 2, self.num_class)
        self.cab4 = CAB(self.num_class * 2, self.num_class)
        # Downward RRBs map each stage's channels to num_class.
        self.rrb_d_1 = RRB(256, self.num_class)
        self.rrb_d_2 = RRB(512, self.num_class)
        self.rrb_d_3 = RRB(1024, self.num_class)
        self.rrb_d_4 = RRB(2048, self.num_class)
        self.upsample = nn.Upsample(scale_factor=2, mode="bilinear")
        self.upsample_4 = nn.Upsample(scale_factor=4, mode="bilinear")
        self.upsample_8 = nn.Upsample(scale_factor=8, mode="bilinear")
        # Upward RRBs refine the fused smooth-path features.
        self.rrb_u_4 = RRB(self.num_class, self.num_class)
        self.rrb_u_3 = RRB(self.num_class, self.num_class)
        self.rrb_u_2 = RRB(self.num_class, self.num_class)
        self.rrb_u_1 = RRB(self.num_class, self.num_class)

        # ---- Border network ----
        self.rrb_db_1 = RRB(256, self.num_class)
        self.rrb_db_2 = RRB(512, self.num_class)
        self.rrb_db_3 = RRB(1024, self.num_class)
        self.rrb_db_4 = RRB(2048, self.num_class)
        self.rrb_trans_1 = RRB(self.num_class, self.num_class)
        self.rrb_trans_2 = RRB(self.num_class, self.num_class)
        self.rrb_trans_3 = RRB(self.num_class, self.num_class)

    def forward(self, x):
        # Shared backbone feature pyramid.
        feat0 = self.layer0(x)
        feat1 = self.layer1(feat0)
        feat2 = self.layer2(feat1)
        feat3 = self.layer3(feat2)
        feat4 = self.layer4(feat3)

        # Border path: accumulate deeper boundary cues onto stage 1.
        border = self.rrb_db_1(feat1)
        border = self.rrb_trans_1(border + self.upsample(self.rrb_db_2(feat2)))
        border = self.rrb_trans_2(border + self.upsample_4(self.rrb_db_3(feat3)))
        border = self.rrb_trans_3(border + self.upsample_8(self.rrb_db_4(feat4)))

        # Smooth path: pooled global context, broadcast back to feat4's
        # spatial size so it can be concatenated inside the CAB.
        smooth = self.global_pool(self.out_conv(feat4))
        smooth = nn.Upsample(size=feat4.size()[2:], mode="nearest")(smooth)
        feat4 = self.rrb_d_4(feat4)
        smooth = self.rrb_u_4(self.cab4([smooth, feat4]))
        feat3 = self.rrb_d_3(feat3)
        smooth = self.rrb_u_3(self.cab3([self.upsample(smooth), feat3]))
        feat2 = self.rrb_d_2(feat2)
        smooth = self.rrb_u_2(self.cab2([self.upsample(smooth), feat2]))
        feat1 = self.rrb_d_1(feat1)
        smooth = self.rrb_u_1(self.cab1([self.upsample(smooth), feat1]))

        return border, smooth
if __name__ == '__main__':
    import torch as t
    # Smoke test: both heads should return maps for a 512x512 input.
    net = DFN(21)
    net.eval()
    sample = t.randn(1, 3, 512, 512)
    border_out, smooth_out = net(sample)
    print(border_out.size(), smooth_out.size())
参考:
B站深度之眼