1.前置
经典分割算法VS实时分割算法
经典分割算法注重精度,不断设计研究更复杂更强大的模型,使得模型分割精度不断提升;这类模型在现有硬件上的计算开销可能较大,但随着硬件技术的提升,这个弱点会被不断弥补。
代表算法:FCN,U-Net,SegNet,DeepLab
实时分割算法注重把模型落地投入到生产应用中,因此希望模型所耗费的计算,存储资源能够尽可能的少,牺牲一部分精度来换取轻量级稳定的模型。
代表算法:ENet,LinkNet,BiSeNet,DFANet
实时分割算法常用思想(通常就是以缩减参数为主要途径)
- 替换主网络
- 减少通道数
- 减少卷积层
- 用能减少计算量的卷积(如组卷积)替代卷积层
- 增加前期数据处理
- 减少复杂融合方式
- 避免使用全连接
实时分割算法常用性能指标:
- ms :毫秒数
- fps:画面每秒传输帧数,通俗来讲就是指动画或视频的画面数
- FLOPs :浮点运算次数(floating point operations),用来衡量模型的计算量(注意区分FLOPS:FLOPS为每秒执行的浮点运算次数,是衡量硬件性能的指标)
- Parameters :模型参数
- Model size :模型大小
激活函数:
为何激活函数不用sigmoid而用ReLU?
当输入值特别大或者特别小时,对应sigmoid的斜率或者说梯度趋近于0,会造成训练中的梯度消失问题(模型是根据梯度来进行优化的,如果梯度接近0那么模型将几乎无法优化)
ReLU则是改善了上述情况,但是仍会存在一个问题:当输入值为负数时梯度也为0。当模型学习率设置过大时,某些神经元的输入有可能变为负数,如果用ReLU的话相当于把这些负数值全部置零,对应的梯度也为0,这些神经元就无法再被更新(即"神经元死亡"问题)。LeakyReLU和PReLU则是给负值设置了一个非零斜率,不同之处是前者的斜率a是预先给定的,后者的a是可学习的参数,可以根据数据训练得到。
2.摘要
ENet:
背景介绍:实时执行像素级语义分割的能力在移动应用程序中至关重要。最近的DCNN网络的缺点在于需要大量浮点运算,运行时间长,阻碍了语义分割的可用性
算法组成:提出了ENet,专门针对需要低延迟操作的任务
文章亮点: ENet与现存模型相比,速度提升了18倍,FLOPs减少了75倍,参数减少了79倍,并能保证不错的准确度,ENet采用了较强的编码器和较弱的解码器,作者认为解码器只是起到了恢复尺寸的作用,所以不必和编码器设置成相同的量级。
模型评估:在CamVid、Cityscapes和SUN RGBD数据集中均得到了不错的成绩
LinkNet:
背景介绍:用于场景理解的像素级语义分割不仅需要准确度,还要保证高效性,才能应用在实时应用程序中
现有算法:现有算法虽然比较准确,但参数和计算量都比较庞大,因此速度很慢
本文算法:本文提出了一种新的DCNN网络,该体系结构使其无需增加大量参数即可进行学习
模型评估:在CamVid和Cityscapes数据集中均得到了不错的成绩
3.PyTorch实现ENet&LinkNet
ENet.py
import torch.nn as nn
from torchvision import models
import torch
class InitialBlock(nn.Module):
    """ENet initial block: strided convolution concatenated with max pooling.

    Two parallel branches both halve the spatial resolution of the input:

    * ``main_branch`` - 3x3 convolution with stride 2;
    * ``ext_branch`` - 3x3 max pooling with stride 2 on the raw input.

    Their outputs are concatenated along the channel axis, batch-normalised
    and passed through the activation.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels after the concatenation.
        bias: whether the convolution has a bias term.
        relu: use ``nn.ReLU`` when True, ``nn.PReLU`` otherwise.
    """

    def __init__(self, in_channels, out_channels, bias=False, relu=True):
        super().__init__()
        activation = nn.ReLU if relu else nn.PReLU

        # The pooled branch keeps all `in_channels` input channels, so the
        # convolution only has to produce the remainder for the concatenation
        # to end up with exactly `out_channels` (13 + 3 = 16 for RGB input).
        self.main_branch = nn.Conv2d(
            in_channels,
            out_channels - in_channels,
            kernel_size=3,
            stride=2,
            padding=1,
            bias=bias)

        # Extension branch: max pooling of the raw input.
        self.ext_branch = nn.MaxPool2d(3, stride=2, padding=1)

        # Normalisation and activation applied after the concatenation.
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.out_activation = activation()

    def forward(self, x):
        """Return the activated, normalised concatenation of both branches."""
        main = self.main_branch(x)
        ext = self.ext_branch(x)

        # Concatenate along the channel dimension.
        out = torch.cat((main, ext), 1)
        out = self.batch_norm(out)
        return self.out_activation(out)
class RegularBottleneck(nn.Module):
    """Residual bottleneck that keeps both resolution and channel count.

    The extension branch is: 1x1 projection -> core convolution -> 1x1
    expansion -> spatial dropout. The core convolution is either a single
    ``kernel_size`` x ``kernel_size`` convolution (optionally dilated) or,
    when ``asymmetric`` is set, a ``kernel_size`` x 1 convolution followed by
    a 1 x ``kernel_size`` convolution. The branch output is added to the
    unchanged input and the sum is activated.

    Args:
        channels: number of input (and output) channels.
        internal_ratio: divisor applied to ``channels`` to get the width of
            the bottleneck.
        kernel_size: kernel size of the core convolution.
        padding: zero padding of the core convolution.
        dilation: dilation of the core convolution.
        asymmetric: factorise the core convolution into two 1-D convolutions.
        dropout_prob: probability used by the ``nn.Dropout2d`` regulariser.
        bias: whether the convolutions have a bias term.
        relu: use ``nn.ReLU`` when True, ``nn.PReLU`` otherwise.
    """

    def __init__(self,
                 channels,
                 internal_ratio=4,
                 kernel_size=3,
                 padding=0,
                 dilation=1,
                 asymmetric=False,
                 dropout_prob=0,
                 bias=False,
                 relu=True):
        super().__init__()
        act_cls = nn.ReLU if relu else nn.PReLU
        # Width of the bottleneck.
        mid = channels // internal_ratio

        # 1x1 projection down to the bottleneck width.
        self.ext_conv1 = nn.Sequential(
            nn.Conv2d(channels, mid, kernel_size=1, stride=1, bias=bias),
            nn.BatchNorm2d(mid),
            act_cls(),
        )

        # Core convolution: one square kernel, or a (k x 1, 1 x k) pair.
        if asymmetric:
            core_specs = [
                ((kernel_size, 1), (padding, 0)),
                ((1, kernel_size), (0, padding)),
            ]
        else:
            core_specs = [(kernel_size, padding)]

        core_layers = []
        for kernel, pad in core_specs:
            core_layers.append(
                nn.Conv2d(mid, mid, kernel_size=kernel, stride=1,
                          padding=pad, dilation=dilation, bias=bias))
            core_layers.append(nn.BatchNorm2d(mid))
            core_layers.append(act_cls())
        self.ext_conv2 = nn.Sequential(*core_layers)

        # 1x1 expansion back to the original width.
        self.ext_conv3 = nn.Sequential(
            nn.Conv2d(mid, channels, kernel_size=1, stride=1, bias=bias),
            nn.BatchNorm2d(channels),
            act_cls(),
        )

        self.ext_regul = nn.Dropout2d(p=dropout_prob)

        # Activation applied to the sum of both branches.
        self.out_activation = act_cls()

    def forward(self, x):
        """Return ``activation(x + extension_branch(x))``."""
        branch = self.ext_conv3(self.ext_conv2(self.ext_conv1(x)))
        branch = self.ext_regul(branch)
        return self.out_activation(x + branch)
class DownsamplingBottleneck(nn.Module):
    """Residual bottleneck that halves the resolution and widens the channels.

    * Main branch: 2x2 max pooling with stride 2, then zero-padding of the
      channel axis up to ``out_channels``.
    * Extension branch: 2x2 stride-2 projection -> 3x3 convolution -> 1x1
      expansion -> spatial dropout.

    Both branches are summed and activated.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        internal_ratio: divisor applied to ``in_channels`` to get the width
            of the bottleneck.
        return_indices: when True the max-pooling indices are returned so
            that a later ``nn.MaxUnpool2d`` can use them.
        dropout_prob: probability used by the ``nn.Dropout2d`` regulariser.
        bias: whether the convolutions have a bias term.
        relu: use ``nn.ReLU`` when True, ``nn.PReLU`` otherwise.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 internal_ratio=4,
                 return_indices=False,
                 dropout_prob=0,
                 bias=False,
                 relu=True):
        super().__init__()

        # Needed again in forward() to unpack the pooling result.
        self.return_indices = return_indices

        internal_channels = in_channels // internal_ratio
        activation = nn.ReLU if relu else nn.PReLU

        # Main branch: max pooling (channel padding happens in forward()).
        self.main_maxl = nn.MaxPool2d(
            2, stride=2, return_indices=return_indices)

        # 2x2 projection convolution with stride 2.
        self.ext_conv1 = nn.Sequential(
            nn.Conv2d(in_channels, internal_channels,
                      kernel_size=2, stride=2, bias=bias),
            nn.BatchNorm2d(internal_channels),
            activation()
        )

        # 3x3 convolution.
        self.ext_conv2 = nn.Sequential(
            nn.Conv2d(internal_channels, internal_channels,
                      kernel_size=3, stride=1, padding=1, bias=bias),
            nn.BatchNorm2d(internal_channels),
            activation()
        )

        # 1x1 expansion convolution.
        self.ext_conv3 = nn.Sequential(
            nn.Conv2d(internal_channels, out_channels,
                      kernel_size=1, stride=1, bias=bias),
            nn.BatchNorm2d(out_channels),
            activation()
        )

        self.ext_regul = nn.Dropout2d(p=dropout_prob)

        # Activation applied to the sum of both branches.
        self.out_activation = activation()

    def forward(self, x):
        """Downsample ``x``.

        Returns:
            A tuple ``(out, max_indices)``. ``max_indices`` holds the
            max-pooling indices when the block was built with
            ``return_indices=True`` and is ``None`` otherwise.
        """
        # Main branch shortcut. `max_indices` must always be bound because it
        # is part of the return value even when indices were not requested.
        max_indices = None
        if self.return_indices:
            main, max_indices = self.main_maxl(x)
        else:
            main = self.main_maxl(x)

        # Extension branch.
        ext = self.ext_conv1(x)
        ext = self.ext_conv2(ext)
        ext = self.ext_conv3(ext)
        ext = self.ext_regul(ext)

        # Zero-pad the channel axis of the main branch so both branches have
        # the same shape. new_zeros() inherits dtype and device from `main`,
        # so this works on any device and under reduced precision.
        n, ch_ext, h, w = ext.size()
        ch_main = main.size()[1]
        padding = main.new_zeros(n, ch_ext - ch_main, h, w)
        main = torch.cat((main, padding), 1)

        # Add main and extension branches.
        out = main + ext
        return self.out_activation(out), max_indices
class UpsamplingBottleneck(nn.Module):
    """Residual bottleneck that doubles the resolution.

    * Main branch: 1x1 convolution + batch norm to reach ``out_channels``,
      then max-unpooling driven by indices recorded by an earlier pooling.
    * Extension branch: 1x1 projection -> 2x2 stride-2 transposed
      convolution (+ batch norm + activation) -> 1x1 expansion -> spatial
      dropout.

    Both branches are summed and activated.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        internal_ratio: divisor applied to ``in_channels`` to get the width
            of the bottleneck.
        dropout_prob: probability used by the ``nn.Dropout2d`` regulariser.
        bias: whether the convolutions have a bias term.
        relu: use ``nn.ReLU`` when True, ``nn.PReLU`` otherwise.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 internal_ratio=4,
                 dropout_prob=0,
                 bias=False,
                 relu=True):
        super().__init__()
        act_cls = nn.ReLU if relu else nn.PReLU
        mid = in_channels // internal_ratio

        # Main branch: channel reduction, then unpooling.
        self.main_conv1 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias),
            nn.BatchNorm2d(out_channels),
        )
        # kernel_size=2 with the default stride (equal to the kernel size).
        self.main_unpool1 = nn.MaxUnpool2d(kernel_size=2)

        # Extension branch: 1x1 projection.
        self.ext_conv1 = nn.Sequential(
            nn.Conv2d(in_channels, mid, kernel_size=1, bias=bias),
            nn.BatchNorm2d(mid),
            act_cls(),
        )

        # Transposed convolution. Kept outside nn.Sequential because it
        # needs the extra `output_size` argument at call time.
        self.ext_tconv1 = nn.ConvTranspose2d(
            mid, mid, kernel_size=2, stride=2, bias=bias)
        self.ext_tconv1_bnorm = nn.BatchNorm2d(mid)
        self.ext_tconv1_activation = act_cls()

        # 1x1 expansion.
        self.ext_conv2 = nn.Sequential(
            nn.Conv2d(mid, out_channels, kernel_size=1, bias=bias),
            nn.BatchNorm2d(out_channels),
            act_cls(),
        )

        self.ext_regul = nn.Dropout2d(p=dropout_prob)

        # Activation applied to the sum of both branches.
        self.out_activation = act_cls()

    def forward(self, x, max_indices, output_size):
        """Upsample ``x``.

        Args:
            x: input feature map.
            max_indices: pooling indices handed to ``nn.MaxUnpool2d``.
            output_size: target size handed to both the unpooling and the
                transposed convolution.
        """
        # Main branch.
        shortcut = self.main_unpool1(
            self.main_conv1(x), max_indices, output_size=output_size)

        # Extension branch.
        branch = self.ext_conv1(x)
        branch = self.ext_tconv1(branch, output_size=output_size)
        branch = self.ext_tconv1_activation(self.ext_tconv1_bnorm(branch))
        branch = self.ext_regul(self.ext_conv2(branch))

        return self.out_activation(shortcut + branch)
class ENet(nn.Module):
    """ENet encoder-decoder segmentation network.

    Layout as built below:

    * initial block (3 -> 16 channels);
    * stage 1: downsampling to 64 channels + 4 regular bottlenecks;
    * stage 2: downsampling to 128 channels + 8 bottlenecks mixing regular,
      dilated (2, 4, 8, 16) and asymmetric (5x1 / 1x5) variants;
    * stage 3: the same 8 bottlenecks as stage 2, without downsampling;
    * stage 4: upsampling to 64 channels + 2 regular bottlenecks;
    * stage 5: upsampling to 16 channels + 1 regular bottleneck;
    * a final transposed convolution producing ``n_classes`` channels at the
      size of the network input.

    Args:
        n_classes: number of output channels.
        encoder_relu: use ReLU (True) or PReLU (False) in the encoder.
        decoder_relu: use ReLU (True) or PReLU (False) in the decoder.
    """

    def __init__(self, n_classes, encoder_relu=False, decoder_relu=True):
        super().__init__()

        # Keyword arguments shared by the blocks of each part of the network.
        stage1_kw = dict(dropout_prob=0.01, relu=encoder_relu)
        encoder_kw = dict(dropout_prob=0.1, relu=encoder_relu)
        decoder_kw = dict(dropout_prob=0.1, relu=decoder_relu)

        self.initial_block = InitialBlock(3, 16, relu=encoder_relu)

        # Stage 1 - Encoder
        self.downsample1_0 = DownsamplingBottleneck(
            16, 64, return_indices=True, **stage1_kw)
        self.regular1_1 = RegularBottleneck(64, padding=1, **stage1_kw)
        self.regular1_2 = RegularBottleneck(64, padding=1, **stage1_kw)
        self.regular1_3 = RegularBottleneck(64, padding=1, **stage1_kw)
        self.regular1_4 = RegularBottleneck(64, padding=1, **stage1_kw)

        # Stage 2 - Encoder
        self.downsample2_0 = DownsamplingBottleneck(
            64, 128, return_indices=True, **encoder_kw)
        self.regular2_1 = RegularBottleneck(128, padding=1, **encoder_kw)
        self.dilated2_2 = RegularBottleneck(
            128, dilation=2, padding=2, **encoder_kw)
        self.asymmetric2_3 = RegularBottleneck(
            128, kernel_size=5, padding=2, asymmetric=True, **encoder_kw)
        self.dilated2_4 = RegularBottleneck(
            128, dilation=4, padding=4, **encoder_kw)
        self.regular2_5 = RegularBottleneck(128, padding=1, **encoder_kw)
        self.dilated2_6 = RegularBottleneck(
            128, dilation=8, padding=8, **encoder_kw)
        self.asymmetric2_7 = RegularBottleneck(
            128, kernel_size=5, asymmetric=True, padding=2, **encoder_kw)
        self.dilated2_8 = RegularBottleneck(
            128, dilation=16, padding=16, **encoder_kw)

        # Stage 3 - Encoder
        self.regular3_0 = RegularBottleneck(128, padding=1, **encoder_kw)
        self.dilated3_1 = RegularBottleneck(
            128, dilation=2, padding=2, **encoder_kw)
        self.asymmetric3_2 = RegularBottleneck(
            128, kernel_size=5, padding=2, asymmetric=True, **encoder_kw)
        self.dilated3_3 = RegularBottleneck(
            128, dilation=4, padding=4, **encoder_kw)
        self.regular3_4 = RegularBottleneck(128, padding=1, **encoder_kw)
        self.dilated3_5 = RegularBottleneck(
            128, dilation=8, padding=8, **encoder_kw)
        self.asymmetric3_6 = RegularBottleneck(
            128, kernel_size=5, asymmetric=True, padding=2, **encoder_kw)
        self.dilated3_7 = RegularBottleneck(
            128, dilation=16, padding=16, **encoder_kw)

        # Stage 4 - Decoder
        self.upsample4_0 = UpsamplingBottleneck(128, 64, **decoder_kw)
        self.regular4_1 = RegularBottleneck(64, padding=1, **decoder_kw)
        self.regular4_2 = RegularBottleneck(64, padding=1, **decoder_kw)

        # Stage 5 - Decoder
        self.upsample5_0 = UpsamplingBottleneck(64, 16, **decoder_kw)
        self.regular5_1 = RegularBottleneck(16, padding=1, **decoder_kw)

        # Final upsampling to the per-class score maps.
        self.transposed_conv = nn.ConvTranspose2d(
            16, n_classes, kernel_size=3, stride=2, padding=1, bias=False)

    def forward(self, x):
        """Return the ``n_classes``-channel output at the size of ``x``."""
        # Sizes are recorded before each downsampling step so the decoder
        # can restore exactly the same sizes.
        input_size = x.size()
        x = self.initial_block(x)

        # Stage 1 - Encoder
        stage1_input_size = x.size()
        x, max_indices1_0 = self.downsample1_0(x)
        for block in (self.regular1_1, self.regular1_2,
                      self.regular1_3, self.regular1_4):
            x = block(x)

        # Stage 2 - Encoder
        stage2_input_size = x.size()
        x, max_indices2_0 = self.downsample2_0(x)
        for block in (self.regular2_1, self.dilated2_2, self.asymmetric2_3,
                      self.dilated2_4, self.regular2_5, self.dilated2_6,
                      self.asymmetric2_7, self.dilated2_8):
            x = block(x)

        # Stage 3 - Encoder
        for block in (self.regular3_0, self.dilated3_1, self.asymmetric3_2,
                      self.dilated3_3, self.regular3_4, self.dilated3_5,
                      self.asymmetric3_6, self.dilated3_7):
            x = block(x)

        # Stage 4 - Decoder
        x = self.upsample4_0(x, max_indices2_0, output_size=stage2_input_size)
        x = self.regular4_2(self.regular4_1(x))

        # Stage 5 - Decoder
        x = self.upsample5_0(x, max_indices1_0, output_size=stage1_input_size)
        x = self.regular5_1(x)

        return self.transposed_conv(x, output_size=input_size)
if __name__ == '__main__':
    # Smoke test: push one random 352x480 RGB image through a 12-class ENet
    # and print the size of the result.
    inputs = torch.randn(1, 3, 352, 480)
    model = ENet(n_classes=12)
    print(model(inputs).size())
LinkNet.py
import torch.nn as nn
from torchvision import models
import torch
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    ``conv1`` applies ``stride`` and may change the channel count; ``conv2``
    keeps both. Whenever the main path changes the shape of the input
    (``stride > 1`` or ``in_planes != out_planes``) the shortcut is a 1x1
    strided convolution + batch norm, so that the residual sum is
    shape-consistent.

    Note: because both convolutions use a 3x3 kernel, the residual sum only
    lines up when ``padding=1``.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: stride of the first convolution (and of the shortcut).
        padding: zero padding used by both 3x3 convolutions.
        bias: whether the two 3x3 convolutions have a bias term.
    """

    def __init__(self, in_planes, out_planes, stride=1, padding=0, bias=False):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, out_planes, 3, stride, padding, bias=bias)
        self.bn1 = nn.BatchNorm2d(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_planes, out_planes, 3, 1, padding, bias=bias)
        self.bn2 = nn.BatchNorm2d(out_planes)

        self.downsample = None
        if stride > 1 or in_planes != out_planes:
            # 1x1 projection: floor((H - 1) / stride) + 1 is exactly the
            # output size of a padded (padding=1) 3x3 convolution with the
            # same stride. An unpadded 3x3 shortcut would come out smaller
            # than the main path and break the addition below.
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes)
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out
class Encoder(nn.Module):
    """LinkNet encoder stage: two ``BasicBlock`` modules in sequence.

    The first block applies ``stride`` and maps ``in_planes`` to
    ``out_planes``; the second keeps the channel count and uses stride 1.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: stride of the first block.
        padding: padding forwarded to both blocks.
        bias: bias flag forwarded to both blocks.
    """

    def __init__(self, in_planes, out_planes, stride=1, padding=0, bias=False):
        super().__init__()
        self.block1 = BasicBlock(
            in_planes, out_planes, stride=stride, padding=padding, bias=bias)
        self.block2 = BasicBlock(
            out_planes, out_planes, stride=1, padding=padding, bias=bias)

    def forward(self, x):
        """Apply ``block1`` and then ``block2``."""
        return self.block2(self.block1(x))
class Decoder(nn.Module):
    """LinkNet decoder stage.

    Three conv + batch-norm + ReLU groups:

    1. ``conv1``: 1x1 convolution reducing the channels to ``in_planes // 4``;
    2. ``tp_conv``: transposed convolution at that reduced width, configured
       by ``kernel_size``, ``stride``, ``padding`` and ``output_padding``;
    3. ``conv2``: 1x1 convolution mapping to ``out_planes``.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        kernel_size: kernel size of the transposed convolution.
        stride: stride of the transposed convolution.
        padding: padding of the transposed convolution.
        output_padding: output padding of the transposed convolution.
        bias: whether the convolutions have a bias term.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1,
                 padding=0, output_padding=0, bias=False):
        super().__init__()
        reduced = in_planes // 4

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_planes, reduced, kernel_size=1, stride=1,
                      padding=0, bias=bias),
            nn.BatchNorm2d(reduced),
            nn.ReLU(inplace=True),
        )
        self.tp_conv = nn.Sequential(
            nn.ConvTranspose2d(reduced, reduced, kernel_size=kernel_size,
                               stride=stride, padding=padding,
                               output_padding=output_padding, bias=bias),
            nn.BatchNorm2d(reduced),
            nn.ReLU(inplace=True),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(reduced, out_planes, kernel_size=1, stride=1,
                      padding=0, bias=bias),
            nn.BatchNorm2d(out_planes),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Apply ``conv1``, ``tp_conv`` and ``conv2`` in that order."""
        for stage in (self.conv1, self.tp_conv, self.conv2):
            x = stage(x)
        return x
class linknet(nn.Module):
    """LinkNet segmentation network on top of a torchvision ResNet-18.

    The ResNet-18 stem (conv1, bn1, relu, maxpool) and its four residual
    layers form the encoder. Four ``Decoder`` stages mirror the encoder, and
    the output of each decoder stage is added to the input of the
    corresponding encoder stage. A small head (transposed conv -> conv ->
    transposed conv) produces ``n_classes`` output channels.

    Args:
        n_classes: number of output channels.
        pretrained: forwarded to ``models.resnet18``. The default ``True``
            keeps the original behaviour of loading pretrained weights; pass
            ``False`` to build the network without fetching them.
    """

    def __init__(self, n_classes=12, pretrained=True):
        super(linknet, self).__init__()
        base = models.resnet18(pretrained=pretrained)

        # Encoder: reuse the ResNet-18 stem and residual layers.
        self.in_block = nn.Sequential(
            base.conv1,
            base.bn1,
            base.relu,
            base.maxpool
        )
        self.encoder1 = base.layer1
        self.encoder2 = base.layer2
        self.encoder3 = base.layer3
        self.encoder4 = base.layer4

        # Decoder: decoder1 uses stride 1, the other three use stride 2.
        self.decoder1 = Decoder(64, 64, 3, 1, 1, 0)
        self.decoder2 = Decoder(128, 64, 3, 2, 1, 1)
        self.decoder3 = Decoder(256, 128, 3, 2, 1, 1)
        self.decoder4 = Decoder(512, 256, 3, 2, 1, 1)

        # Classifier head.
        self.tp_conv1 = nn.Sequential(nn.ConvTranspose2d(64, 32, 3, 2, 1, 1),
                                      nn.BatchNorm2d(32),
                                      nn.ReLU(inplace=True))
        self.conv2 = nn.Sequential(nn.Conv2d(32, 32, 3, 1, 1),
                                   nn.BatchNorm2d(32),
                                   nn.ReLU(inplace=True))
        self.tp_conv2 = nn.ConvTranspose2d(32, n_classes, 2, 2, 0)

    def forward(self, x):
        """Return the ``n_classes``-channel output for image batch ``x``.

        NOTE(review): the skip additions need encoder and decoder feature
        maps of equal size; the 352x480 demo input satisfies this - confirm
        before feeding other input sizes.
        """
        # Initial block
        x = self.in_block(x)

        # Encoder blocks
        e1 = self.encoder1(x)
        e2 = self.encoder2(e1)
        e3 = self.encoder3(e2)
        e4 = self.encoder4(e3)

        # Decoder blocks, each linked to the input of the matching encoder.
        d4 = e3 + self.decoder4(e4)
        d3 = e2 + self.decoder3(d4)
        d2 = e1 + self.decoder2(d3)
        d1 = x + self.decoder1(d2)

        # Classifier head
        y = self.tp_conv1(d1)
        y = self.conv2(y)
        y = self.tp_conv2(y)
        return y
if __name__ == '__main__':
    # Smoke test: push one random 352x480 RGB image through a 12-class
    # LinkNet and print the size of the result.
    inputs = torch.randn(1, 3, 352, 480)
    model = linknet(n_classes=12)
    print(model(inputs).size())
参考:B站深度之眼