一、模型概览
DeepLabv3+由Encoder与Decoder两部分构成。Encoder主要包括backbone(骨架/底模)和ASPP,及对ASPP输出的降维。backbone可以使用ResNet、Xception等。
二、ASPP
简单来说,ASPP将backbone提取出的特征图输入多个平行且不同的层(如卷积层、空洞卷积层、池化层),然后将获得的多个输出拼接。
class ASPP(nn.Module):
    """Atrous Spatial Pyramid Pooling followed by a 1x1 projection.

    The input feature map is fed through five parallel branches: a 1x1
    convolution, three 3x3 dilated convolutions (rates 6, 12 and 18) and an
    image-pooling branch. The branch outputs are concatenated along the
    channel axis and reduced to 256 channels by a final 1x1 convolution.

    Args:
        in_channels: Number of channels of the input feature map.
    """

    def __init__(self, in_channels=2048):
        super(ASPP, self).__init__()
        # 1x1 convolution branch
        self.conv1 = ConvLayer(in_channels, 256, 1, padding=0)
        # Dilated 3x3 branches; padding == dilation keeps the spatial size.
        # rate = 6
        self.conv2 = ConvLayer(in_channels, 256, 3, padding=6, dilation=6)
        # rate = 12
        self.conv3 = ConvLayer(in_channels, 256, 3, padding=12, dilation=12)
        # rate = 18
        self.conv4 = ConvLayer(in_channels, 256, 3, padding=18, dilation=18)
        # Image pooling branch: global pooling -> 1x1 conv -> upsample.
        # NOTE(review): the DeepLabv3 paper describes this branch with global
        # average pooling; max pooling is kept so existing checkpoints behave
        # the same - confirm whether this is intentional.
        self.pooling = nn.AdaptiveMaxPool2d((1, 1))
        # NOTE: ConvLayer applies BatchNorm2d to a 1x1 map here, so training
        # mode needs more than one sample per batch.
        self.conv5 = ConvLayer(in_channels, 256, 1, padding=0)
        # Project the concatenated branch outputs back to 256 channels.
        self.conv6 = ConvLayer(256 * 5, 256, 1, padding=0)

    def forward(self, x):
        """Apply all ASPP branches to ``x`` and return the fused features.

        Args:
            x: Feature map of shape (N, in_channels, H, W).

        Returns:
            Tensor of shape (N, 256, H, W).
        """
        o1 = self.conv1(x)
        o2 = self.conv2(x)
        o3 = self.conv3(x)
        o4 = self.conv4(x)
        o5 = self.pooling(x)
        o5 = self.conv5(o5)
        # Upsample the pooled 1x1 map to the exact (H, W) of the input. Using
        # the width alone as a scale factor breaks on non-square inputs.
        o5 = F.interpolate(o5, size=x.shape[-2:], mode='bilinear',
                           align_corners=False)
        o = torch.cat((o1, o2, o3, o4, o5), dim=1)
        o = self.conv6(o)
        return o
- 图中rate = 空洞卷积的dilation
- Image Pooling由池化层、卷积层、上采样构成
- 顺带包括将其输出降维的步骤
三、Decoder
- Decoder使用low-level特征和ASPP输出降维后的特征。
- 使用卷积核大小为1、步长为1的卷积层减少low-level特征的通道数。
- 对ASPP输出降维后的特征使用双线性插值来上采样。
- 拼接卷积层的输出与上采样的输出,将其输入卷积核大小为3、步长为1、填充为1的卷积层进行预测,然后将结果上采样至原始图像大小。
参数
- num_classes:种类数量
- low_channels:low-level特征通道数
- low_multiple:low-level特征尺寸相对于encoder最后一个输出尺寸的倍数(即low-level特征尺寸 ÷ encoder最后一个输出的尺寸)
- multiple:原始图像尺寸相对于low-level特征尺寸的倍数(即原始图像尺寸 ÷ low-level特征尺寸)
class DeepLabHead(nn.Module):
    """DeepLabv3+ decoder head.

    Reduces the low-level features to 48 channels, upsamples the ASPP output
    to the low-level resolution, concatenates both, predicts per-class scores
    with a 3x3 convolution and upsamples the prediction by ``multiple``.

    Args:
        num_classes: Number of output classes.
        low_channels: Number of channels of the low-level features.
        low_multiple: Low-level feature size divided by the size of the last
            encoder output. Stored for backward compatibility; the forward
            pass resizes to the actual low-level feature size instead.
        multiple: Input image size divided by the low-level feature size.
    """

    def __init__(self, num_classes, low_channels=64, low_multiple=2, multiple=4):
        super(DeepLabHead, self).__init__()
        # reduce channels of low-level features
        self.conv1 = ConvLayer(low_channels, 48, 1, padding=0)
        # classifier over the concatenated (48 + 256)-channel features
        self.conv2 = nn.Conv2d(48 + 256, num_classes, 3, padding=1)
        # low-level features size / size of the last output of encoder
        self.low_multiple = low_multiple
        # image size / low-level features size
        self.multiple = multiple

    def forward(self, *x):
        """Decode the low-level features and the ASPP output into class scores.

        Args:
            *x: ``x[0]`` is the low-level feature map and ``x[1]`` is the
                256-channel ASPP output.

        Returns:
            Class-score map with ``num_classes`` channels, upsampled by
            ``multiple`` relative to the low-level feature size.
        """
        low_o = self.conv1(x[0])
        # Resize to the exact low-level size instead of a fixed scale factor:
        # strided layers round odd sizes up (e.g. 125 -> 63), so 63 * 2 would
        # not match 125 and the concatenation below would fail.
        o = F.interpolate(x[1], size=low_o.shape[-2:], mode='bilinear',
                          align_corners=False)
        o = torch.cat((low_o, o), dim=1)
        o = self.conv2(o)
        o = F.interpolate(o, scale_factor=self.multiple, mode='bilinear',
                          align_corners=False)
        return o
四、DeepLab-ResNet101示例
1、ResNet
backbone所用的ResNet与普通ResNet的不同之处是后两个残差块层(blocks3、blocks4)的残差块使用了空洞卷积。在forward函数中,需要缓存low-level特征与最终的特征图。
class ResNet(Base):
    """ResNet backbone whose last two stages use dilated convolutions.

    Args:
        blocks_num: Number of bottleneck blocks in each of the four stages.
        dropout: Dropout probability applied to the output of the last stage.
    """

    def __init__(self, blocks_num: list[int], dropout=0.5):
        super().__init__(2048)
        self.block = Bottleneck
        expansion = self.block.expansion
        # Stem: 7x7 stride-2 convolution followed by a stride-2 max pool.
        stem = [
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(3, 3), stride=2, padding=1),
        ]
        self.pl = nn.Sequential(*stem)
        # Residual stages; the last two use dilation instead of striding.
        self.blocks1 = self.make_blocks(64, 64, blocks_num[0])
        self.blocks2 = self.make_blocks(64 * expansion, 128, blocks_num[1])
        self.blocks3 = self.make_blocks(128 * expansion, 256, blocks_num[2], dilation=2)
        self.blocks4 = self.make_blocks(256 * expansion, 512, blocks_num[3], dilation=4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``[low_level, deep]`` feature maps for the decoder.

        ``low_level`` is the stem output; ``deep`` is the output of the last
        stage after dropout.
        """
        low_level = self.pl(x)
        deep = low_level
        for stage in (self.blocks1, self.blocks2, self.blocks3, self.blocks4):
            deep = stage(deep)
        return [low_level, self.dropout(deep)]

    def make_blocks(self, in_channels, size, num, dilation=1):
        """Build one stage made of ``num`` bottleneck blocks.

        The first block carries a projection shortcut. It downsamples with
        stride 2 unless the stage is the first one (``in_channels == size``)
        or uses dilation, in which case the resolution is kept.
        """
        out_channels = size * self.block.expansion
        keep_resolution = in_channels == size or dilation > 1
        stride = 1 if keep_resolution else 2
        projection = Shortcut(in_channels, out_channels, stride)
        layers = [self.block(in_channels, size, stride=stride,
                             shortcut=projection, dilation=dilation)]
        # Remaining blocks keep channels and resolution (identity shortcut).
        layers.extend(self.block(out_channels, size, dilation=dilation)
                      for _ in range(1, num))
        return nn.Sequential(*layers)
使用空洞卷积的Bottleneck与普通Bottleneck的不同之处在于:原本负责下采样的3×3卷积层也可以改用空洞卷积,但两者互斥。也就是说,进行下采样时不会使用空洞卷积,使用空洞卷积时不会进行下采样。
class Shortcut(nn.Sequential):
    """Projection shortcut: a 1x1 convolution followed by batch normalization.

    Args:
        in_channels: Channels of the block input.
        out_channels: Channels the shortcut must produce.
        stride: Stride of the 1x1 convolution.
    """

    def __init__(self, in_channels, out_channels, stride):
        projection = nn.Conv2d(in_channels, out_channels,
                               kernel_size=1, stride=stride)
        norm = nn.BatchNorm2d(out_channels)
        super().__init__(projection, norm)
class Bottleneck(nn.Module):
    """Three-layer residual bottleneck block with optional dilation.

    Args:
        in_channels: Channels of the block input.
        size: Channels of the two inner layers; the block outputs
            ``size * expansion`` channels.
        stride: Stride of the 3x3 convolution.
        dilation: Dilation (and padding) of the 3x3 convolution.
        shortcut: Module applied to the input on the skip path; an identity
            mapping is used when omitted.
    """

    expansion: int = 4

    def __init__(self, in_channels, size, stride=1, dilation=1, shortcut: nn.Module = None):
        super().__init__()
        expanded = size * self.expansion
        # 1x1 reduction
        self.conv1 = ConvLayer(in_channels, size, kernel_size=1, padding=0)
        # 3x3 convolution carrying the stride / dilation
        self.conv2 = ConvLayer(size, size, kernel_size=3, stride=stride,
                               dilation=dilation, padding=dilation)
        # 1x1 expansion (no activation before the residual sum)
        self.conv3 = nn.Conv2d(size, expanded, kernel_size=1, stride=1)
        self.bn3 = nn.BatchNorm2d(expanded)
        # skip path
        self.shortcut = shortcut if shortcut is not None else nn.Identity()
        # normalization and activation applied after the residual sum
        self.bn4 = nn.BatchNorm2d(expanded)
        self.relu4 = nn.ReLU()

    def forward(self, x):
        """Return ``relu(bn(F(x) + shortcut(x)))``."""
        residual = self.bn3(self.conv3(self.conv2(self.conv1(x))))
        merged = residual + self.shortcut(x)
        return self.relu4(self.bn4(merged))
更多ResNet信息,请移步http://t.csdnimg.cn/7Bo23 。
2、DeepLab
class DeepLab(nn.Module):
    """DeepLabv3+ model assembled from a backbone, ASPP and a decoder head.

    Args:
        base: Backbone whose forward returns a list of feature maps and which
            exposes ``out_channels`` for its last feature map.
        head: Decoder that receives the feature maps as positional arguments.
    """

    def __init__(self, base: Base, head: nn.Module):
        super().__init__()
        # encoder: feature extraction followed by ASPP
        self.base = base
        self.aspp = ASPP(self.base.out_channels)
        # decoder
        self.head = head

    def forward(self, x):
        """Run the backbone, apply ASPP to its second output, then decode."""
        features = self.base(x)
        # Replace the deep feature map by its ASPP-processed version in place.
        features[1] = self.aspp(features[1])
        return self.head(*features)
3、一些之前代码用到的自定义类
class Base(nn.Module):
    """Parent class for backbones; stores the declared output channel count.

    Args:
        out_channels: Value stored on the instance as ``out_channels``.
    """

    def __init__(self, out_channels=2048):
        super().__init__()
        self.out_channels = out_channels
class ConvLayer(nn.Sequential):
    """Convolution block: ``Conv2d`` -> ``BatchNorm2d`` -> ``ReLU``.

    Args:
        in_channels: Input channels of the convolution.
        out_channels: Output channels of the convolution and batch norm.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
        groups: Number of convolution groups.
        dilation: Convolution dilation.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, groups=1, dilation=1):
        conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
        )
        norm = nn.BatchNorm2d(out_channels)
        activation = nn.ReLU()
        super().__init__(conv, norm, activation)
五、友情提示
在未对底模进行预训练的情况下,DeepLab的训练效率可能较低(如下图,模型为DeepLab-ResNet101)。建议先为底模搭配图像分类任务的头部,通过图像分类任务进行预训练,然后将其参数迁移至DeepLab。