代码学习笔记
Unsupervised Monocular Depth Estimation with Left-Right Consistency :monodepth_model.py
源码:monodepth
源模型网址:model_download
感谢在学习中参考的优秀博文
上图为整体的网络模型
1、在encoder 和 decoder的部分,作者用了 skip-connection 的结构。
2、上采样结构,作者用插值上采样再卷积的方法,替换反卷积。(信息扩增)
3、在decoder部分的最外面4层,作者都估计了当前特征大小所对应的视差的值,并且将它上采样后传递给了decoder的下层,这样能确保每一层都在做提取disparity这件事,同时也相当于做了一个coarse-to-fine的深度预测。
model_architecture
loss
total_loss
Appearance Matching Loss:图片的重建损失函数
Left-Right Disparity Consistency Loss:左右一致视差,dl和dr之间也存在着与原图相等的关系,也就是说
经过上面的式子,可以理解:(后面的式子,在源码中是以bilinear_sampling(Dr,Dl)实现。)
Disparity Smoothness Loss:保证深度图的光滑,和图像的梯度一致性
第三部分代码注释:monodepth_model.py
"""
代码学习 注释专用
song
stay hungry stay foolish
"""
from __future__ import absolute_import, division, print_function # 绝对引入
"""Fully convolutional model for monocular depth estimation
模型结果下载地址: http://visual.cs.ucl.ac.uk/pubs/monoDepth/
"""
from collections import namedtuple # 用来创建一个与 tuple 类似的对象,而对象拥有可以访问的属性
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
from bilinear_sampler import * # 双线性采样 通过双线性插值的方法来 mapping 出对应的图
# Immutable hyper-parameter container for the monodepth model; fields are
# accessed as attributes (params.encoder, params.height, ...).
monodepth_parameters = namedtuple('parameters', [
    'encoder',                     # backbone selector: 'vgg' or 'resnet50'
    'height',
    'width',
    'batch_size',
    'num_threads',
    'num_epochs',
    'do_stereo',                   # feed both images to the encoder
    'wrap_mode',                   # border handling for the bilinear sampler
    'use_deconv',                  # transposed conv instead of upsample+conv
    'alpha_image_loss',            # SSIM vs L1 trade-off weight
    'disp_gradient_loss_weight',
    'lr_loss_weight',
    'full_summary',
])
class MonodepthModel(object):
"""monodepth model"""
def __init__(self, params, mode, left, right, reuse_variables=None, model_index=0):
    """Build the whole graph: network, outputs, and (when training) losses/summaries.

    Args:
        params: monodepth_parameters namedtuple of hyper-parameters.
        mode: 'train' or 'test'; losses and summaries are skipped in test mode.
        left: left input image tensor.
        right: right input image tensor.
        reuse_variables: forwarded to the variable scope for weight sharing.
        model_index: suffix used to group this model's summaries.
    """
    self.params = params
    self.mode = mode
    self.left = left
    self.right = right
    self.model_collection = ['model_' + str(model_index)]
    self.reuse_variables = reuse_variables

    self.build_model()
    self.build_outputs()

    # Loss and summary subgraphs are only needed while training.
    if self.mode != 'test':
        self.build_losses()
        self.build_summaries()
def gradient_x(self, img):
    """Horizontal finite difference of an NHWC tensor.

    Subtracts each pixel's right neighbour from it, so the output width
    is one less than the input width.
    """
    cur = img[:, :, :-1, :]   # columns 0 .. w-2
    nxt = img[:, :, 1:, :]    # columns 1 .. w-1
    return cur - nxt
def gradient_y(self, img):
    """Vertical finite difference of an NHWC tensor.

    Subtracts each pixel's lower neighbour from it, so the output height
    is one less than the input height.
    """
    cur = img[:, :-1, :, :]   # rows 0 .. h-2
    nxt = img[:, 1:, :, :]    # rows 1 .. h-1
    return cur - nxt
def upsample_nn(self, x, ratio):
    """Nearest-neighbour upsample of an NHWC tensor by an integer ratio."""
    shape = tf.shape(x)
    new_size = [shape[1] * ratio, shape[2] * ratio]
    return tf.image.resize_nearest_neighbor(x, new_size)
def scale_pyramid(self, img, num_scales):
    """Build an image pyramid [I, I/2, I/4, ...] via area resizing.

    Element i has spatial size (h // 2**i, w // 2**i). The returned list
    holds num_scales tensors, starting with the full-resolution input, so
    losses can be computed at every scale with a single loop.
    """
    shape = tf.shape(img)
    h, w = shape[1], shape[2]
    pyramid = [img]
    for level in range(1, num_scales):
        factor = 2 ** level
        pyramid.append(tf.image.resize_area(img, [h // factor, w // factor]))
    return pyramid
def generate_image_left(self, img, disp):
    """Synthesize the left view: bilinearly sample `img` at horizontal offsets -disp."""
    return bilinear_sampler_1d_h(img, -disp) # backward warp along the width axis
def generate_image_right(self, img, disp):
    """Synthesize the right view: bilinearly sample `img` at horizontal offsets +disp."""
    return bilinear_sampler_1d_h(img, disp)
def SSIM(self, x, y):
    """Per-pixel SSIM-based dissimilarity between x and y, clipped to [0, 1].

    Local means/variances come from 3x3 average pooling with 'VALID'
    padding. Returns (1 - SSIM) / 2 so that 0 means identical patches;
    this is the structural term of the appearance-matching loss.
    """
    # Standard SSIM stabilization constants (K1=0.01, K2=0.03, L=1).
    C1 = 0.01 ** 2
    C2 = 0.03 ** 2

    # Local first- and second-order statistics via 3x3 mean filters.
    mu_x = slim.avg_pool2d(x, 3, 1, 'VALID')
    mu_y = slim.avg_pool2d(y, 3, 1, 'VALID')
    sigma_x = slim.avg_pool2d(x ** 2, 3, 1, 'VALID') - mu_x ** 2
    sigma_y = slim.avg_pool2d(y ** 2, 3, 1, 'VALID') - mu_y ** 2
    sigma_xy = slim.avg_pool2d(x * y, 3, 1, 'VALID') - mu_x * mu_y

    numerator = (2 * mu_x * mu_y + C1) * (2 * sigma_xy + C2)
    denominator = (mu_x ** 2 + mu_y ** 2 + C1) * (sigma_x + sigma_y + C2)
    ssim = numerator / denominator

    return tf.clip_by_value((1 - ssim) / 2, 0, 1) # map similarity to a bounded loss
def get_disparity_smoothness(self, disp, pyramid):
    """Edge-aware smoothness terms for a pyramid of disparity maps.

    At each scale, disparity gradients are down-weighted by
    exp(-mean(|image gradient|)) so that depth discontinuities are
    penalized less where the image itself has strong edges.

    Args:
        disp: list of disparity tensors, one per pyramid scale.
        pyramid: list of image tensors matching `disp` scale-for-scale.

    Returns:
        The weighted x-gradients followed by the weighted y-gradients
        (a list of length 2 * len(disp)).
    """
    disp_gradients_x = [self.gradient_x(d) for d in disp]
    disp_gradients_y = [self.gradient_y(d) for d in disp]

    image_gradients_x = [self.gradient_x(img) for img in pyramid]
    image_gradients_y = [self.gradient_y(img) for img in pyramid]

    # Weight = exp(-|image gradient|), averaged over the channel axis.
    weights_x = [tf.exp(-tf.reduce_mean(tf.abs(g), 3, keep_dims=True)) for g in image_gradients_x]
    weights_y = [tf.exp(-tf.reduce_mean(tf.abs(g), 3, keep_dims=True)) for g in image_gradients_y]

    # Generalized from a hard-coded range(4): works for any number of
    # scales while remaining identical for the usual 4-level pyramid.
    smoothness_x = [d * w for d, w in zip(disp_gradients_x, weights_x)]
    smoothness_y = [d * w for d, w in zip(disp_gradients_y, weights_y)]
    return smoothness_x + smoothness_y
def get_disp(self, x):
    """Predict a 2-channel (left/right) disparity map bounded to (0, 0.3).

    A sigmoid constrains the raw convolution output to (0, 1); the 0.3
    factor caps the maximum disparity (presumably as a fraction of image
    width used by the bilinear sampler — confirm against bilinear_sampler).
    """
    return 0.3 * self.conv(x, 2, 3, 1, tf.nn.sigmoid)
def conv(self, x, num_out_layers, kernel_size, stride, activation_fn=tf.nn.elu):
    """Convolution with explicit zero padding followed by a 'VALID' conv.

    Padding (kernel_size - 1) // 2 on each spatial side reproduces 'SAME'
    output sizes for stride 1 while keeping the padding explicit.

    Args:
        x: NHWC input tensor.
        num_out_layers: number of output filters.
        kernel_size: square kernel side length.
        stride: spatial stride.
        activation_fn: activation applied by slim.conv2d (ELU by default).
    """
    pad = np.floor((kernel_size - 1) / 2).astype(np.int32)  # symmetric half-kernel pad
    padded = tf.pad(x, [[0, 0], [pad, pad], [pad, pad], [0, 0]])  # pad H and W only
    return slim.conv2d(padded, num_out_layers, kernel_size, stride, 'VALID', activation_fn=activation_fn)
def conv_block(self, x, num_out_layers, kernel_size):
    """Two stacked convolutions; the second uses stride 2 to halve resolution."""
    out = self.conv(x, num_out_layers, kernel_size, 1)
    return self.conv(out, num_out_layers, kernel_size, 2)
def maxpool(self, x, kernel_size):
    """Max pooling (slim default stride 2) with explicit zero padding."""
    pad = np.floor((kernel_size - 1) / 2).astype(np.int32)
    padded = tf.pad(x, [[0, 0], [pad, pad], [pad, pad], [0, 0]])
    return slim.max_pool2d(padded, kernel_size)
def resconv(self, x, num_layers, stride):
    """Bottleneck residual unit: 1x1 -> 3x3 (stride) -> 1x1 (4x channels).

    The shortcut is projected with a 1x1 convolution when the channel
    count or spatial size changes; otherwise the input is added directly.
    """
    # NOTE(review): `tf.shape(x)[3] != num_layers` compares a Tensor with a
    # Python int via `!=`, which is truthy here, so the projection branch is
    # effectively always taken — confirm before changing this condition.
    needs_projection = tf.shape(x)[3] != num_layers or stride == 2

    out = self.conv(x, num_layers, 1, 1)
    out = self.conv(out, num_layers, 3, stride)
    out = self.conv(out, 4 * num_layers, 1, 1, None)  # no activation before the add

    if needs_projection:
        shortcut = self.conv(x, 4 * num_layers, 1, stride, None)
    else:
        shortcut = x
    return tf.nn.elu(out + shortcut)
def resblock(self, x, num_layers, num_blocks):
    """Stack num_blocks residual units; only the final one downsamples (stride 2)."""
    out = x
    for _ in range(num_blocks - 1):
        out = self.resconv(out, num_layers, 1)
    return self.resconv(out, num_layers, 2)
def upconv(self, x, num_out_layers, kernel_size, scale):
    """Upsample by nearest neighbour then convolve — a deconvolution substitute
    that avoids checkerboard artifacts of transposed convolutions."""
    upsampled = self.upsample_nn(x, scale)
    return self.conv(upsampled, num_out_layers, kernel_size, 1)
def deconv(self, x, num_out_layers, kernel_size, scale):
    """Transposed-convolution upsampling (used when params.use_deconv is set).

    The input is zero-padded by one pixel on each spatial side before the
    transposed convolution, and the result is cropped back afterwards.
    """
    p_x = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]])
    conv = slim.conv2d_transpose(p_x, num_out_layers, kernel_size, scale, 'SAME')
    # Crop the border produced by the padding; the asymmetric 3:-1 slice
    # presumably accounts for the stride-`scale` upsampling of the 1-pixel
    # pad — TODO confirm output size matches upconv for the sizes used here.
    return conv[:,3:-1,3:-1,:]
def build_vgg(self): # VGG-style encoder/decoder
    """Build the VGG encoder/decoder and store self.disp1..disp4 (fine to coarse).

    The decoder predicts a disparity map at each of its four outermost
    scales and feeds the upsampled prediction to the next finer scale,
    giving a coarse-to-fine estimate.
    """
    # set convenience functions
    conv = self.conv
    if self.params.use_deconv:
        upconv = self.deconv
    else:
        upconv = self.upconv

    with tf.variable_scope('encoder'): # downsampling convolutions
        conv1 = self.conv_block(self.model_input, 32, 7) # H/2
        conv2 = self.conv_block(conv1, 64, 5) # H/4
        conv3 = self.conv_block(conv2, 128, 3) # H/8
        conv4 = self.conv_block(conv3, 256, 3) # H/16
        conv5 = self.conv_block(conv4, 512, 3) # H/32
        conv6 = self.conv_block(conv5, 512, 3) # H/64
        conv7 = self.conv_block(conv6, 512, 3) # H/128

    with tf.variable_scope('skips'):
        # Encoder activations reused by the decoder (skip connections).
        skip1 = conv1
        skip2 = conv2
        skip3 = conv3
        skip4 = conv4
        skip5 = conv5
        skip6 = conv6

    with tf.variable_scope('decoder'): # up-convolutions with skip concatenation
        upconv7 = upconv(conv7, 512, 3, 2) # H/64
        concat7 = tf.concat([upconv7, skip6], 3)
        iconv7 = conv(concat7, 512, 3, 1)

        upconv6 = upconv(iconv7, 512, 3, 2) # H/32
        concat6 = tf.concat([upconv6, skip5], 3)
        iconv6 = conv(concat6, 512, 3, 1)

        upconv5 = upconv(iconv6, 256, 3, 2) # H/16
        concat5 = tf.concat([upconv5, skip4], 3)
        iconv5 = conv(concat5, 256, 3, 1)

        upconv4 = upconv(iconv5, 128, 3, 2) # H/8
        concat4 = tf.concat([upconv4, skip3], 3)
        iconv4 = conv(concat4, 128, 3, 1)
        self.disp4 = self.get_disp(iconv4)
        udisp4 = self.upsample_nn(self.disp4, 2) # pass coarse disparity to the finer scale

        upconv3 = upconv(iconv4, 64, 3, 2) # H/4
        concat3 = tf.concat([upconv3, skip2, udisp4], 3)
        iconv3 = conv(concat3, 64, 3, 1)
        self.disp3 = self.get_disp(iconv3)
        udisp3 = self.upsample_nn(self.disp3, 2)

        upconv2 = upconv(iconv3, 32, 3, 2) # H/2
        concat2 = tf.concat([upconv2, skip1, udisp3], 3)
        iconv2 = conv(concat2, 32, 3, 1)
        self.disp2 = self.get_disp(iconv2)
        udisp2 = self.upsample_nn(self.disp2, 2)

        upconv1 = upconv(iconv2, 16, 3, 2) # H
        concat1 = tf.concat([upconv1, udisp2], 3)
        iconv1 = conv(concat1, 16, 3, 1)
        self.disp1 = self.get_disp(iconv1)
def build_resnet50(self): # ResNet-50 style encoder/decoder
    """Build the ResNet-50 encoder/decoder and store self.disp1..disp4."""
    # set convenience functions
    conv = self.conv
    if self.params.use_deconv:
        upconv = self.deconv
    else:
        upconv = self.upconv

    with tf.variable_scope('encoder'):
        conv1 = conv(self.model_input, 64, 7, 2) # H/2  -  64D
        pool1 = self.maxpool(conv1, 3) # H/4  -  64D
        conv2 = self.resblock(pool1, 64, 3) # H/8  -  256D
        conv3 = self.resblock(conv2, 128, 4) # H/16 -  512D
        conv4 = self.resblock(conv3, 256, 6) # H/32 - 1024D
        conv5 = self.resblock(conv4, 512, 3) # H/64 - 2048D

    with tf.variable_scope('skips'):
        skip1 = conv1
        skip2 = pool1
        skip3 = conv2
        skip4 = conv3
        skip5 = conv4

    # DECODING: skip connections let the decoder recover fine spatial detail.
    with tf.variable_scope('decoder'):
        upconv6 = upconv(conv5, 512, 3, 2) #H/32
        concat6 = tf.concat([upconv6, skip5], 3)
        iconv6 = conv(concat6, 512, 3, 1)

        upconv5 = upconv(iconv6, 256, 3, 2) #H/16
        concat5 = tf.concat([upconv5, skip4], 3)
        iconv5 = conv(concat5, 256, 3, 1)

        upconv4 = upconv(iconv5, 128, 3, 2) #H/8
        concat4 = tf.concat([upconv4, skip3], 3)
        iconv4 = conv(concat4, 128, 3, 1)
        self.disp4 = self.get_disp(iconv4)
        udisp4 = self.upsample_nn(self.disp4, 2)

        upconv3 = upconv(iconv4, 64, 3, 2) #H/4
        concat3 = tf.concat([upconv3, skip2, udisp4], 3)
        iconv3 = conv(concat3, 64, 3, 1)
        self.disp3 = self.get_disp(iconv3)
        udisp3 = self.upsample_nn(self.disp3, 2)

        upconv2 = upconv(iconv3, 32, 3, 2) #H/2
        concat2 = tf.concat([upconv2, skip1, udisp3], 3)
        iconv2 = conv(concat2, 32, 3, 1)
        self.disp2 = self.get_disp(iconv2)
        udisp2 = self.upsample_nn(self.disp2, 2)

        upconv1 = upconv(iconv2, 16, 3, 2) #H
        concat1 = tf.concat([upconv1, udisp2], 3)
        iconv1 = conv(concat1, 16, 3, 1)
        self.disp1 = self.get_disp(iconv1) # disparity predictions at 4 scales
def build_model(self): # assemble the overall model
    """Build the input pyramids, pick the encoder input, and construct the network."""
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], activation_fn=tf.nn.elu):
        with tf.variable_scope('model', reuse=self.reuse_variables):
            self.left_pyramid = self.scale_pyramid(self.left, 4) # 4-scale pyramid of the left image
            if self.mode == 'train': # the right image is only needed for training losses
                self.right_pyramid = self.scale_pyramid(self.right, 4)

            if self.params.do_stereo: # stereo mode feeds both images to the encoder
                self.model_input = tf.concat([self.left, self.right], 3)
            else:
                self.model_input = self.left # otherwise only the left image is the input

            # build model: select the encoder backbone
            if self.params.encoder == 'vgg':
                self.build_vgg()
            elif self.params.encoder == 'resnet50':
                self.build_resnet50()
            else:
                return None
def build_outputs(self): # derive per-scale outputs from the raw predictions
    """Split disparities into left/right, warp images, and build consistency terms."""
    # STORE DISPARITIES
    with tf.variable_scope('disparities'): # channel 0 = d_left, channel 1 = d_right
        self.disp_est = [self.disp1, self.disp2, self.disp3, self.disp4]
        self.disp_left_est = [tf.expand_dims(d[:,:,:,0], 3) for d in self.disp_est]
        self.disp_right_est = [tf.expand_dims(d[:,:,:,1], 3) for d in self.disp_est]

    if self.mode == 'test': # inference only needs the disparities
        return

    # GENERATE IMAGES
    with tf.variable_scope('images'): # reconstruct I'l and I'r by backward sampling with d_l and d_r
        self.left_est = [self.generate_image_left(self.right_pyramid[i], self.disp_left_est[i]) for i in range(4)]
        self.right_est = [self.generate_image_right(self.left_pyramid[i], self.disp_right_est[i]) for i in range(4)]

    # LR CONSISTENCY: project each disparity map into the opposite view
    with tf.variable_scope('left-right'):
        self.right_to_left_disp = [self.generate_image_left(self.disp_right_est[i], self.disp_left_est[i]) for i in range(4)]
        self.left_to_right_disp = [self.generate_image_right(self.disp_left_est[i], self.disp_right_est[i]) for i in range(4)]

    # DISPARITY SMOOTHNESS
    with tf.variable_scope('smoothness'):
        self.disp_left_smoothness = self.get_disparity_smoothness(self.disp_left_est, self.left_pyramid)
        self.disp_right_smoothness = self.get_disparity_smoothness(self.disp_right_est, self.right_pyramid)
def build_losses(self): # define the training losses
    """Total loss = image reconstruction + weighted smoothness + weighted LR consistency.

    Each term is computed for both views at all 4 pyramid scales and summed.
    """
    with tf.variable_scope('losses', reuse=self.reuse_variables):
        # IMAGE RECONSTRUCTION
        # L1
        self.l1_left = [tf.abs( self.left_est[i] - self.left_pyramid[i]) for i in range(4)]
        self.l1_reconstruction_loss_left = [tf.reduce_mean(l) for l in self.l1_left]
        self.l1_right = [tf.abs(self.right_est[i] - self.right_pyramid[i]) for i in range(4)]
        self.l1_reconstruction_loss_right = [tf.reduce_mean(l) for l in self.l1_right]

        # SSIM
        self.ssim_left = [self.SSIM( self.left_est[i], self.left_pyramid[i]) for i in range(4)]
        self.ssim_loss_left = [tf.reduce_mean(s) for s in self.ssim_left]
        self.ssim_right = [self.SSIM(self.right_est[i], self.right_pyramid[i]) for i in range(4)]
        self.ssim_loss_right = [tf.reduce_mean(s) for s in self.ssim_right]

        # WEIGHTED SUM: alpha blends SSIM against L1 (appearance matching loss)
        self.image_loss_right = [self.params.alpha_image_loss * self.ssim_loss_right[i] + (1 - self.params.alpha_image_loss) * self.l1_reconstruction_loss_right[i] for i in range(4)]
        self.image_loss_left = [self.params.alpha_image_loss * self.ssim_loss_left[i] + (1 - self.params.alpha_image_loss) * self.l1_reconstruction_loss_left[i] for i in range(4)]
        self.image_loss = tf.add_n(self.image_loss_left + self.image_loss_right)

        # DISPARITY SMOOTHNESS: down-weight coarser scales by 1 / 2**i
        self.disp_left_loss = [tf.reduce_mean(tf.abs(self.disp_left_smoothness[i])) / 2 ** i for i in range(4)]
        self.disp_right_loss = [tf.reduce_mean(tf.abs(self.disp_right_smoothness[i])) / 2 ** i for i in range(4)]
        self.disp_gradient_loss = tf.add_n(self.disp_left_loss + self.disp_right_loss)

        # LR CONSISTENCY loss
        self.lr_left_loss = [tf.reduce_mean(tf.abs(self.right_to_left_disp[i] - self.disp_left_est[i])) for i in range(4)]
        self.lr_right_loss = [tf.reduce_mean(tf.abs(self.left_to_right_disp[i] - self.disp_right_est[i])) for i in range(4)]
        self.lr_loss = tf.add_n(self.lr_left_loss + self.lr_right_loss)

        # TOTAL LOSS
        self.total_loss = self.image_loss + self.params.disp_gradient_loss_weight * self.disp_gradient_loss + self.params.lr_loss_weight * self.lr_loss
def build_summaries(self):
    """Register per-scale scalar and image summaries (placed on the CPU).

    Summaries are grouped under self.model_collection; the heavier image
    summaries are only added when params.full_summary is set.
    """
    # SUMMARIES
    with tf.device('/cpu:0'):
        for i in range(4):
            tf.summary.scalar('ssim_loss_' + str(i), self.ssim_loss_left[i] + self.ssim_loss_right[i], collections=self.model_collection)
            tf.summary.scalar('l1_loss_' + str(i), self.l1_reconstruction_loss_left[i] + self.l1_reconstruction_loss_right[i], collections=self.model_collection)
            tf.summary.scalar('image_loss_' + str(i), self.image_loss_left[i] + self.image_loss_right[i], collections=self.model_collection)
            tf.summary.scalar('disp_gradient_loss_' + str(i), self.disp_left_loss[i] + self.disp_right_loss[i], collections=self.model_collection)
            tf.summary.scalar('lr_loss_' + str(i), self.lr_left_loss[i] + self.lr_right_loss[i], collections=self.model_collection)
            tf.summary.image('disp_left_est_' + str(i), self.disp_left_est[i], max_outputs=4, collections=self.model_collection)
            tf.summary.image('disp_right_est_' + str(i), self.disp_right_est[i], max_outputs=4, collections=self.model_collection)

            if self.params.full_summary:
                tf.summary.image('left_est_' + str(i), self.left_est[i], max_outputs=4, collections=self.model_collection)
                tf.summary.image('right_est_' + str(i), self.right_est[i], max_outputs=4, collections=self.model_collection)
                tf.summary.image('ssim_left_' + str(i), self.ssim_left[i], max_outputs=4, collections=self.model_collection)
                tf.summary.image('ssim_right_' + str(i), self.ssim_right[i], max_outputs=4, collections=self.model_collection)
                tf.summary.image('l1_left_' + str(i), self.l1_left[i], max_outputs=4, collections=self.model_collection)
                tf.summary.image('l1_right_' + str(i), self.l1_right[i], max_outputs=4, collections=self.model_collection)

        if self.params.full_summary:
            tf.summary.image('left', self.left, max_outputs=4, collections=self.model_collection)
            tf.summary.image('right', self.right, max_outputs=4, collections=self.model_collection)