1. Highlights
- Minimum reprojection loss: handles occlusions
- Full-resolution multi-scale sampling: reduces visual artifacts
- Auto-masking loss: ignores pixels that violate the camera motion assumption
2. Loss
2.1 Reprojection error
def compute_reprojection_loss(self, pred, target):
    """Computes reprojection loss between a batch of predicted and target images
    """
    abs_diff = torch.abs(target - pred)
    l1_loss = abs_diff.mean(1, True)

    if self.opt.no_ssim:
        reprojection_loss = l1_loss
    else:
        # photometric error: weighted combination of SSIM and L1 (alpha = 0.85)
        ssim_loss = self.ssim(pred, target).mean(1, True)
        reprojection_loss = 0.85 * ssim_loss + 0.15 * l1_loss

    return reprojection_loss
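Here self.ssim is a windowed SSIM dissimilarity, so that 0.85 * ssim_loss + 0.15 * l1_loss matches the photometric error used in the paper. A minimal sketch of such a layer, assuming a 3x3 average-pooling window and a (1 - SSIM)/2 output (the exact implementation in the repo may differ):

import torch
import torch.nn as nn

class SSIM(nn.Module):
    """3x3 windowed SSIM dissimilarity, returning (1 - SSIM) / 2 per pixel."""
    def __init__(self):
        super(SSIM, self).__init__()
        self.mu_x_pool = nn.AvgPool2d(3, 1)
        self.mu_y_pool = nn.AvgPool2d(3, 1)
        self.sig_x_pool = nn.AvgPool2d(3, 1)
        self.sig_y_pool = nn.AvgPool2d(3, 1)
        self.sig_xy_pool = nn.AvgPool2d(3, 1)
        self.refl = nn.ReflectionPad2d(1)   # keep spatial size after 3x3 pooling
        self.C1 = 0.01 ** 2
        self.C2 = 0.03 ** 2

    def forward(self, x, y):
        x = self.refl(x)
        y = self.refl(y)
        mu_x = self.mu_x_pool(x)
        mu_y = self.mu_y_pool(y)
        sigma_x = self.sig_x_pool(x ** 2) - mu_x ** 2
        sigma_y = self.sig_y_pool(y ** 2) - mu_y ** 2
        sigma_xy = self.sig_xy_pool(x * y) - mu_x * mu_y
        ssim_n = (2 * mu_x * mu_y + self.C1) * (2 * sigma_xy + self.C2)
        ssim_d = (mu_x ** 2 + mu_y ** 2 + self.C1) * (sigma_x + sigma_y + self.C2)
        return torch.clamp((1 - ssim_n / ssim_d) / 2, 0, 1)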
- The camera intrinsics K are assumed to be identical for all images
- Bilinear sampling is used to sample the source image; this operation is locally differentiable, so gradients can flow through the warp
outputs[("sample", frame_id, scale)] = pix_coords
outputs[("color", frame_id, scale)] = F.grid_sample(
inputs[("color", frame_id, source_scale)],
outputs[("sample", frame_id, scale)],
padding_mode="border",align_corners=False)
grid_sample performs bilinear interpolation over the source image.
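As a minimal self-contained sketch of the sampling semantics (toy tensors, independent of the training code above): grid coordinates are normalised to [-1, 1] in (x, y) order, and each sampled value is a bilinear blend of the four nearest source pixels.

import torch
import torch.nn.functional as F

# 1x1x4x4 "image" with values 0..15
img = torch.arange(16, dtype=torch.float32).view(1, 1, 4, 4)

# sampling grid of shape (N, H_out, W_out, 2) holding (x, y) in [-1, 1]
grid = torch.tensor([[[[-1.0, -1.0],     # top-left corner
                       [ 0.0,  0.0]]]])  # image centre

out = F.grid_sample(img, grid, mode="bilinear",
                    padding_mode="border", align_corners=False)
# out -> tensor([[[[0.0000, 7.5000]]]]): the centre sample is the bilinear
# average of the four middle pixels (5, 6, 9, 10)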
2.2 Edge-aware smoothness
def get_smooth_loss(disp, img):
    """Computes the smoothness loss for a disparity image
    The color image is used for edge-aware smoothness
    """
    grad_disp_x = torch.abs(disp[:, :, :, :-1] - disp[:, :, :, 1:])
    grad_disp_y = torch.abs(disp[:, :, :-1, :] - disp[:, :, 1:, :])

    grad_img_x = torch.mean(torch.abs(img[:, :, :, :-1] - img[:, :, :, 1:]), 1, keepdim=True)
    grad_img_y = torch.mean(torch.abs(img[:, :, :-1, :] - img[:, :, 1:, :]), 1, keepdim=True)

    # down-weight disparity gradients where the image itself has strong edges
    grad_disp_x *= torch.exp(-grad_img_x)
    grad_disp_y *= torch.exp(-grad_img_y)

    return grad_disp_x.mean() + grad_disp_y.mean()
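In the paper the disparity is normalised by its mean before the smoothness term is applied, and the term is decayed for coarser scales. A hedged sketch of how get_smooth_loss might be used in the training loop (the names color, scale, self.opt.disparity_smoothness and the 1e-7 constant follow the style of the snippets above and are assumptions):

# disp: predicted disparity at this scale, color: the corresponding colour image
mean_disp = disp.mean(2, True).mean(3, True)
norm_disp = disp / (mean_disp + 1e-7)            # mean-normalised disparity d / mean(d)
smooth_loss = get_smooth_loss(norm_disp, color)

# weight the smoothness term and decay it for coarser scales
loss += self.opt.disparity_smoothness * smooth_loss / (2 ** scale)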
3. Improvements
3.1 Minimum reprojection error
Instead of averaging the reprojection error over all source frames, the per-pixel minimum over the source frames is used; this handles pixels that are occluded or move out of view in one of the source frames (see the sketch after the auto-masking code in 3.2).
3.2 Auto-masking
A per-pixel mask μ weights the loss. μ is binary, taking values in {0, 1}, and is computed automatically on the forward pass of the network rather than being learned.
Pixels that stay the same between adjacent frames of the sequence usually indicate a static camera, an object moving at the same speed as the camera, or a low-texture region; the mask removes these pixels from the loss.
if not self.opt.disable_automasking:
    # identity reprojection loss: compare the raw (unwarped) source frames with the target
    identity_reprojection_losses = []
    for frame_id in self.opt.frame_ids[1:]:
        pred = inputs[("color", frame_id, source_scale)]
        identity_reprojection_losses.append(
            self.compute_reprojection_loss(pred, target))

    identity_reprojection_losses = torch.cat(identity_reprojection_losses, 1)

    if self.opt.avg_reprojection:
        identity_reprojection_loss = identity_reprojection_losses.mean(1, keepdim=True)
    else:
        # save both images, and do min all at once below
        identity_reprojection_loss = identity_reprojection_losses

# (reprojection_loss for the warped source frames is computed here, omitted)

if not self.opt.disable_automasking:
    # add random numbers to break ties
    identity_reprojection_loss += torch.randn(
        identity_reprojection_loss.shape, device=self.device) * 0.00001

    combined = torch.cat((identity_reprojection_loss, reprojection_loss), dim=1)
else:
    combined = reprojection_loss
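The minimum reprojection error (3.1) and the auto-mask are then realised by a single per-pixel torch.min over the concatenated identity and warped reprojection losses: wherever an identity (unwarped) loss wins, the pixel is effectively masked out. A hedged sketch of this final step (variable names follow the snippet above; the identity_selection output is an assumption, used only to visualise μ):

if combined.shape[1] == 1:
    to_optimise = combined
else:
    # per-pixel minimum over all identity and warped reprojection losses
    to_optimise, idxs = torch.min(combined, dim=1)

if not self.opt.disable_automasking:
    # pixels where a warped loss (index >= number of identity losses) was selected
    # correspond to mu = 1; the others are masked out automatically
    outputs["identity_selection/{}".format(scale)] = (
        idxs > identity_reprojection_loss.shape[1] - 1).float()

loss += to_optimise.mean()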
3.3 Multi-scale estimation
Rather than computing the photometric error on downsampled colour images at each decoder scale, the low-resolution disparity maps are upsampled to the full input resolution and the reprojection error is evaluated there, which reduces holes and texture-copy artifacts (see the sketch below).
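A sketch of the full-resolution multi-scale scheme, following the structure of the snippets above (the option name v1_multiscale is an assumption; source_scale = 0 means the loss is computed against the full-resolution image):

for scale in self.opt.scales:
    disp = outputs[("disp", scale)]
    if self.opt.v1_multiscale:
        # old behaviour: compute the loss at the decoder scale
        source_scale = scale
    else:
        # new behaviour: upsample the low-resolution disparity to the input
        # resolution and compute the reprojection loss there
        disp = F.interpolate(
            disp, [self.opt.height, self.opt.width],
            mode="bilinear", align_corners=False)
        source_scale = 0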
3.4 Network details
- Sigmoid is used at the outputs and ELU nonlinearities everywhere else
class ConvBlock(nn.Module):
    """Layer to perform a convolution followed by ELU
    """
    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()

        self.conv = Conv3x3(in_channels, out_channels)
        self.nonlin = nn.ELU(inplace=True)

    def forward(self, x):
        out = self.conv(x)
        out = self.nonlin(out)
        return out
def forward(self, input_features):
    # depth decoder: upsample, concatenate skip connections from the encoder,
    # and predict a sigmoid disparity map at each output scale
    self.outputs = {}

    x = input_features[-1]
    for i in range(4, -1, -1):
        x = self.convs[("upconv", i, 0)](x)
        x = [upsample(x)]
        if self.use_skips and i > 0:
            x += [input_features[i - 1]]
        x = torch.cat(x, 1)
        x = self.convs[("upconv", i, 1)](x)
        if i in self.scales:
            self.outputs[("disp", i)] = self.sigmoid(self.convs[("dispconv", i)](x))

    return self.outputs
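The upsample helper used in the decoder above is assumed to be plain nearest-neighbour upsampling by a factor of two, e.g.:

import torch.nn.functional as F

def upsample(x):
    """Upsample the input tensor by a factor of 2 (nearest neighbour)."""
    return F.interpolate(x, scale_factor=2, mode="nearest")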
- The sigmoid output σ is converted to depth with D = 1/(aσ + b), where a and b are chosen to constrain D between 0.1 and 100 units
def disp_to_depth(disp, min_depth, max_depth):
    """Convert network's sigmoid output into depth prediction
    The formula for this conversion is given in the 'additional considerations'
    section of the paper.
    """
    min_disp = 1 / max_depth
    max_disp = 1 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth
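For the 0.1 to 100 unit range mentioned above, this corresponds to a = max_disp - min_disp = 9.99 and b = min_disp = 0.01, so D = 1/(aσ + b) spans exactly [0.1, 100]. A quick check:

import torch

scaled_disp, depth = disp_to_depth(torch.tensor([0.0, 0.5, 1.0]),
                                   min_depth=0.1, max_depth=100)
# scaled_disp -> [0.0100, 5.0050, 10.0000]
# depth       -> [100.0000, 0.1998, 0.1000]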
- Reflection padding is used in place of zero padding in the decoder, and when a sample falls outside the image boundary the value of the nearest boundary pixel in the source image is returned (padding_mode="border" in grid_sample above). This significantly reduces the border artifacts found in existing methods.
class Conv3x3(nn.Module):
    """Layer to pad and convolve input
    """
    def __init__(self, in_channels, out_channels, use_refl=True):
        super(Conv3x3, self).__init__()

        if use_refl:
            self.pad = nn.ReflectionPad2d(1)
        else:
            self.pad = nn.ZeroPad2d(1)
        self.conv = nn.Conv2d(int(in_channels), int(out_channels), 3)

    def forward(self, x):
        out = self.pad(x)
        out = self.conv(out)
        return out
- For monocular training a sequence length of three frames is used. The pose network is a ResNet18 modified to accept a pair of colour images (six channels) as input, and it predicts a single 6-DoF relative pose (see the sketch below).
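A minimal sketch of the six-channel modification, assuming a torchvision ResNet18 backbone whose first convolution is replaced (the weight-duplication trick for pretrained models and the Identity head are assumptions; the small head that regresses the 6-DoF pose is not shown):

import torch
import torch.nn as nn
import torchvision.models as models

def make_pose_encoder():
    """ResNet18 that takes two RGB frames stacked along the channel axis."""
    resnet = models.resnet18(weights=None)
    old_conv = resnet.conv1

    # Replace the 3-channel stem with a 6-channel one (two concatenated frames).
    resnet.conv1 = nn.Conv2d(6, 64, kernel_size=7, stride=2, padding=3, bias=False)
    with torch.no_grad():
        # Duplicate the original filters across both frames and halve them so the
        # activation scale stays roughly the same when pretrained weights are used.
        resnet.conv1.weight.copy_(torch.cat([old_conv.weight] * 2, dim=1) / 2)

    resnet.fc = nn.Identity()  # keep the 512-d pooled feature
    return resnet

encoder = make_pose_encoder()
feat = encoder(torch.randn(1, 6, 192, 640))  # -> shape (1, 512)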