这篇文章主要有两点,SPP与3d卷积
3.1 Network Architecture
一般的网络第一个卷积都是用的7×7卷积,因为感受野要大,可是这样子的话计算复杂度较大,本文用三个串联的3×3的卷积去做这样的事情,同样的感受野,计算量大大降低,可以一试。conv0~conv4是残差块,用来提取unary feature。后面就是SPP,用来提取上下文(context)信息。再concat左右特征到cost volume当中,后续通过3d卷积来regularize。最后输出的视差图是通过回归的方式得到的。详细的网络结构可以参见论文中的表格。
算法的图如下:
左右图输入到权值共享的CNN当中从而进行了特征提取。一个SPP module用来concat不同尺度的子区域特征表示。然后左右图特征再构成一个4d的cost volume。最后输入到3D卷积当中进行cost volume的regularization以及最终的视差估计。
特征提取层的对应代码如下:
class BasicBlock(nn.Module):
    """Residual block of two 3x3 conv-bn units with a skip connection.

    Note: unlike the canonical ResNet block, no ReLU is applied after the
    residual addition — this matches the PSMNet feature extractor.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride, downsample, pad, dilation):
        super(BasicBlock, self).__init__()
        # First conv carries the (possibly strided) spatial reduction and
        # is the only activated unit in the block.
        self.conv1 = nn.Sequential(
            convbn(inplanes, planes, 3, stride, pad, dilation),
            nn.ReLU(inplace=True),
        )
        # Second conv keeps resolution; no activation before the addition.
        self.conv2 = convbn(planes, planes, 3, 1, pad, dilation)
        # Optional projection (1x1 conv + BN) so the skip path matches the
        # main path's shape; None means identity skip.
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x if self.downsample is None else self.downsample(x)
        out = self.conv2(self.conv1(x))
        return out + residual
# Feature-extraction front end (fragment of feature_extraction.__init__;
# indentation was lost when this snippet was pasted into the post).
self.inplanes = 32
# Three stacked 3x3 convs replace a single 7x7 stem: same receptive field,
# far fewer FLOPs. The first conv has stride 2, halving spatial resolution.
self.firstconv = nn.Sequential(convbn(3, 32, 3, 2, 1, 1),
nn.ReLU(inplace=True),
convbn(32, 32, 3, 1, 1, 1),
nn.ReLU(inplace=True),
convbn(32, 32, 3, 1, 1, 1),
nn.ReLU(inplace=True))
# Residual stages: _make_layer(block, planes, num_blocks, stride, pad, dilation).
# layer2 downsamples once more (stride 2); layer4 uses dilation 2 to enlarge
# the receptive field without further downsampling.
self.layer1 = self._make_layer(BasicBlock, 32, 3, 1,1,1)
self.layer2 = self._make_layer(BasicBlock, 64, 16, 2,1,1)
self.layer3 = self._make_layer(BasicBlock, 128, 3, 1,1,1)
self.layer4 = self._make_layer(BasicBlock, 128, 3, 1,1,2)
def _make_layer(self, block, planes, blocks, stride, pad, dilation):
    """Build one residual stage of `blocks` units.

    Only the first unit may change stride/width; when it does, a 1x1
    conv + BN projection is attached so its skip connection matches the
    main path. Updates self.inplanes to this stage's output width.
    """
    out_planes = planes * block.expansion
    projection = None
    if stride != 1 or self.inplanes != out_planes:
        projection = nn.Sequential(
            nn.Conv2d(self.inplanes, out_planes,
                      kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm2d(out_planes),
        )
    # First unit handles the stride / channel change.
    units = [block(self.inplanes, planes, stride, projection, pad, dilation)]
    self.inplanes = out_planes
    # Remaining units keep resolution and width (stride 1, identity skip).
    units.extend(block(self.inplanes, planes, 1, None, pad, dilation)
                 for _ in range(1, blocks))
    return nn.Sequential(*units)
3.2 Spatial Pyramid Pooling Module
单凭像素强度来确定像素间的对应关系是非常困难的。因此,蕴含丰富物体上下文(context)信息的图像特征对视差估计很有帮助,尤其是在 ill-posed 区域。物体与其子区域之间的层次关系可以被 SPP module 学习到,并融合进分层的上下文信息中。
SPP的存在是用来消除CNN当中对于固定输入尺度的限制的。SPP产生不同level的特征map,将会一起合并并送入全连接层中。SPP module使用自适应均值池化将特征压缩到四个尺度,随后接一个1×1的卷积来减少特征维度,再通过双线性插值将特征upsample到与原始特征尺度一样大。最后,不同level的特征映射concat成最终的SPP特征映射。
本文中也是用四个固定尺度的平均池化模块:64,32,16,以及8,后面接一个1×1的卷积,还有upsample。
代码如下:
平均池化+卷积
# SPP branches (fragment of feature_extraction.__init__; indentation was
# lost in the paste). Four parallel average-pooling scales — 64x64, 32x32,
# 16x16, 8x8 — each followed by a 1x1 conv compressing 128 channels to 32.
self.branch1 = nn.Sequential(nn.AvgPool2d((64, 64), stride=(64,64)),
convbn(128, 32, 1, 1, 0, 1),
nn.ReLU(inplace=True))
self.branch2 = nn.Sequential(nn.AvgPool2d((32, 32), stride=(32,32)),
convbn(128, 32, 1, 1, 0, 1),
nn.ReLU(inplace=True))
self.branch3 = nn.Sequential(nn.AvgPool2d((16, 16), stride=(16,16)),
convbn(128, 32, 1, 1, 0, 1),
nn.ReLU(inplace=True))
self.branch4 = nn.Sequential(nn.AvgPool2d((8, 8), stride=(8,8)),
convbn(128, 32, 1, 1, 0, 1),
nn.ReLU(inplace=True))
# Fusion head: 320 input channels = 4 branches x 32 plus the raw and skip
# feature maps concatenated in forward (presumably 64 + 128 — confirm
# against the torch.cat call); output is the final 32-channel SPP feature.
self.lastconv = nn.Sequential(convbn(320, 128, 3, 1, 1, 1),
nn.ReLU(inplace=True),
nn.Conv2d(128, 32, kernel_size=1, padding=0, stride = 1, bias=False))
上采样,然后concat
# SPP forward fragment: run each pooled branch, then bilinearly upsample it
# back to output_skip's spatial size so all scales can be concatenated.
# NOTE(review): F.upsample is deprecated in modern PyTorch in favor of
# F.interpolate; kept as-is to match the original repo.
output_branch1 = self.branch1(output_skip)
output_branch1 = F.upsample(output_branch1, (output_skip.size()[2],output_skip.size()[3]),mode='bilinear')
output_branch2 = self.branch2(output_skip)
output_branch2 = F.upsample(output_branch2, (output_skip.size()[2],output_skip.size()[3]),mode='bilinear')
output_branch3 = self.branch3(output_skip)
output_branch3 = F.upsample(output_branch3, (output_skip.size()[2],output_skip.size()[3]),mode='bilinear')
output_branch4 = self.branch4(output_skip)
output_branch4 = F.upsample(output_branch4, (output_skip.size()[2],output_skip.size()[3]),mode='bilinear')
# Concatenate along the channel axis: raw features, skip features, and the
# four pooled scales (coarsest pooling listed last-to-first).
output_feature = torch.cat((output_raw, output_skip, output_branch4, output_branch3, output_branch2, output_branch1), 1)
output_feature = self.lastconv(output_feature)
3.3 Cost Volume
concat左右特征成为一个4D张量(height×width×disparity×feature size)
# Build the 4D cost volume: (batch, 2*feat_channels, maxdisp/4, H/4, W/4).
# For each candidate disparity i, left (reference) features are paired with
# right (target) features shifted by i pixels; columns with no valid match
# (the leftmost i columns) stay zero.
# FIX: use integer division — under Python 3, `self.maxdisp / 4` yields a
# float and breaks both the tensor size and `range()` (the original PSMNet
# repo targeted Python 2, where `/` on ints truncated).
# NOTE(review): Variable is a deprecated no-op wrapper in modern PyTorch;
# kept for compatibility with the old repo.
cost = Variable(torch.FloatTensor(refimg_fea.size()[0], refimg_fea.size()[1]*2, self.maxdisp//4, refimg_fea.size()[2], refimg_fea.size()[3]).zero_()).cuda()
for i in range(self.maxdisp//4):
    if i > 0:
        # Left view keeps its own columns from i onward; the right view is
        # shifted left by i so matching pixels line up at the same index.
        cost[:, :refimg_fea.size()[1], i, :, i:] = refimg_fea[:, :, :, i:]
        cost[:, refimg_fea.size()[1]:, i, :, i:] = targetimg_fea[:, :, :, :-i]
    else:
        # Disparity 0: both views are aligned as-is.
        cost[:, :refimg_fea.size()[1], i, :, :] = refimg_fea
        cost[:, refimg_fea.size()[1]:, i, :, :] = targetimg_fea
cost = cost.contiguous()
3.4 3D CNN
SPP module 通过包含不同level的特征来使得stereo matching更加的好~为了聚合特征信息与代表空间尺度的disparity 维度。我们提供了两种3D CNN 结构用作cost volume regularization:basic和stacked hourglass 结构。
basic结构就是简单的由residual block块构成。最后通过双线性插值upsample这个cost volume到H×W×D的size上。最后通过回归的方式计算H×W的视差图。
为了更好的学习内容信息,提出了一个stacked hourglass结构。主要有三个hourglass网络,每个都会产生一个视差图。有三个loss。更加高级一点
# PSMNet.__init__ fragment (stacked-hourglass variant); indentation was
# lost when this snippet was pasted.
super(PSMNet, self).__init__()
self.maxdisp = maxdisp  # maximum disparity searched at full resolution
# Shared 2D feature extractor (CNN + SPP) applied to both views.
self.feature_extraction = feature_extraction()
# dres0/dres1: initial 3D conv stack over the 64-channel cost volume
# (left + right 32-channel SPP features concatenated along channels).
self.dres0 = nn.Sequential(convbn_3d(64, 32, 3, 1, 1),
nn.ReLU(inplace=True),
convbn_3d(32, 32, 3, 1, 1),
nn.ReLU(inplace=True))
# dres1 has no final ReLU: it is used residually (added back to its input).
self.dres1 = nn.Sequential(convbn_3d(32, 32, 3, 1, 1),
nn.ReLU(inplace=True),
convbn_3d(32, 32, 3, 1, 1))
# Three stacked hourglass modules for cost-volume regularization.
self.dres2 = hourglass(32)
self.dres3 = hourglass(32)
self.dres4 = hourglass(32)
# One classification head per hourglass output; each emits a single-channel
# 3D cost (disparity x H x W) used for intermediate (deep) supervision.
self.classif1 = nn.Sequential(convbn_3d(32, 32, 3, 1, 1),
nn.ReLU(inplace=True),
nn.Conv3d(32, 1, kernel_size=3, padding=1, stride=1,bias=False))
self.classif2 = nn.Sequential(convbn_3d(32, 32, 3, 1, 1),
nn.ReLU(inplace=True),
nn.Conv3d(32, 1, kernel_size=3, padding=1, stride=1,bias=False))
self.classif3 = nn.Sequential(convbn_3d(32, 32, 3, 1, 1),
nn.ReLU(inplace=True),
nn.Conv3d(32, 1, kernel_size=3, padding=1, stride=1,bias=False))
# PSMNet.forward fragment (stacked hourglass). NOTE(review): indentation
# was lost in this paste — in the official repo the cost1/cost2 upsampling
# and pred1/pred2 computations below all sit inside the first
# `if self.training:` branch; verify against the original source.
cost0 = self.dres0(cost)
cost0 = self.dres1(cost0) + cost0  # residual connection around dres1
# Each hourglass returns (output, pre, post) skip tensors that feed the
# next hourglass; every output is residually added back onto cost0.
out1, pre1, post1 = self.dres2(cost0, None, None)
out1 = out1+cost0
out2, pre2, post2 = self.dres3(out1, pre1, post1)
out2 = out2+cost0
# NOTE(review): pre1 (not pre2) is passed here — this matches the official
# PSMNet repo, so it is presumably intentional.
out3, pre3, post3 = self.dres4(out2, pre1, post2)
out3 = out3+cost0
# Classification heads; later heads are residually stacked on earlier ones.
cost1 = self.classif1(out1)
cost2 = self.classif2(out2) + cost1
cost3 = self.classif3(out3) + cost2
if self.training:
# Trilinearly upsample the intermediate costs to full resolution
# (maxdisp x H x W) for the auxiliary training losses.
cost1 = F.upsample(cost1, [self.maxdisp,left.size()[2],left.size()[3]], mode='trilinear')
cost2 = F.upsample(cost2, [self.maxdisp,left.size()[2],left.size()[3]], mode='trilinear')
# Drop the singleton channel, softmax over the disparity axis, then
# soft-argmin regression to a dense disparity map.
cost1 = torch.squeeze(cost1,1)
pred1 = F.softmax(cost1,dim=1)
pred1 = disparityregression(self.maxdisp)(pred1)
cost2 = torch.squeeze(cost2,1)
pred2 = F.softmax(cost2,dim=1)
pred2 = disparityregression(self.maxdisp)(pred2)
# The final head is always evaluated (training and inference).
cost3 = F.upsample(cost3, [self.maxdisp,left.size()[2],left.size()[3]], mode='trilinear')
cost3 = torch.squeeze(cost3,1)
pred3 = F.softmax(cost3,dim=1)
pred3 = disparityregression(self.maxdisp)(pred3)
if self.training:
# Training returns all three predictions for deep supervision.
return pred1, pred2, pred3
else:
return pred3
3.5 Disparity Regression
用的是GC-Net提出的soft argmin(可微的argmin,对softmax概率加权求期望)
class disparityregression(nn.Module):
    """Soft-argmin disparity regression (GC-Net style).

    Given a softmax probability volume over candidate disparities, returns
    the expected disparity per pixel (a differentiable argmin).
    """

    def __init__(self, maxdisp):
        super(disparityregression, self).__init__()
        # Constant index tensor [0, 1, ..., maxdisp-1] shaped (1, D, 1, 1)
        # so it can be tiled over (N, D, H, W) probability volumes.
        indices = torch.Tensor(np.arange(maxdisp).reshape([1, maxdisp, 1, 1]))
        self.disp = Variable(indices.cuda(), requires_grad=False)

    def forward(self, x):
        # x: softmax probabilities over the disparity axis, (N, D, H, W).
        weights = self.disp.repeat(x.size()[0], 1, x.size()[2], x.size()[3])
        # Expectation over the disparity axis -> (N, H, W) disparity map.
        return torch.sum(x * weights, 1)
3.6 Loss
最终的loss是smooth L1 loss;stacked hourglass结构对三个输出按0.5、0.7、1.0加权求和
# Training-loss fragment of the train step.
# Stacked-hourglass model: three intermediate disparity maps supervised
# with weighted smooth-L1 losses (0.5 / 0.7 / 1.0, as in the PSMNet paper);
# `mask` selects valid ground-truth pixels.
# NOTE(review): size_average is deprecated in modern PyTorch
# (use reduction='mean'); kept here for compatibility with the old repo.
if args.model == 'stackhourglass':
    output1, output2, output3 = model(imgL, imgR)
    output1 = torch.squeeze(output1, 1)
    output2 = torch.squeeze(output2, 1)
    output3 = torch.squeeze(output3, 1)
    loss = 0.5*F.smooth_l1_loss(output1[mask], disp_true[mask], size_average=True) \
        + 0.7*F.smooth_l1_loss(output2[mask], disp_true[mask], size_average=True) \
        + F.smooth_l1_loss(output3[mask], disp_true[mask], size_average=True)
elif args.model == 'basic':
    output = model(imgL, imgR)
    # FIX: the original referenced `output3` here, which is undefined in
    # this branch (NameError) — the basic model returns a single map.
    output = torch.squeeze(output, 1)
    loss = F.smooth_l1_loss(output[mask], disp_true[mask], size_average=True)
最后训练下看下效果吧