来自密歇根大学的研究团队设计的Stacked hourglass network是一种专门用于人体姿态估计的网络结构,曾在MPII竞赛中暂列榜首,目前排名第七,排名在它之前的部分方法借鉴了hourglass的网络结构,并进行了改进,可以说hourglass的网络结构算是受到了业界的认可。
如下图所示,Stacked hourglass network可以翻译成堆叠沙漏网络,首先进行卷积池化处理,并进行多次下采样操作,获得一些分辨率较低的特征,从而使计算复杂度降低。为了使图像特征的分辨率上升,紧接着进行多次上采样。上采样操作使得图像的分辨率增高,同时更有能力预测物体的准确位置。通过这样一种处理,相较于其他网络,该网络结构能够通过增大感受野的操作来获得更多的上下文信息。
每一个stack里包含一个hourglass,hourglass的结构如下图所示
每个hourglass里包含若干个residual,在上图的hourglass结构中为四个。residual的结构如下图所示,直线里的操作为先进行下采样再进行上采样,虚线里的操作可以为在原尺度下的卷积操作,也可以不做任何操作,最终对两条线的输出进行相加操作。
用TensorFlow实现hourglass:
首先定义一些基本的操作函数:
def conv_bn_relu(inputs, filters, kernel_size = 1, strides = 1, pad = 'VALID', name = 'conv_bn_relu'):
    """Convolution -> batch norm -> ReLU building block.

    Args:
        inputs: NHWC input tensor.
        filters: number of output channels.
        kernel_size: side length of the square convolution kernel.
        strides: spatial stride applied in both H and W.
        pad: padding mode for tf.nn.conv2d ('VALID' or 'SAME').
        name: name scope for the ops created here.

    Returns:
        Batch-normalized, ReLU-activated feature map.

    NOTE(review): `training` is read from module scope — confirm it is defined
    (a bool or placeholder) before graph construction.
    """
    with tf.name_scope(name):
        kernel = tf.Variable(
            tf.contrib.layers.xavier_initializer(uniform=False)(
                [kernel_size, kernel_size, inputs.get_shape().as_list()[3], filters]),
            name='weights')
        # BUG FIX: the original hard-coded padding='VALID' here, silently
        # ignoring the `pad` parameter; pass it through instead.
        conv = tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1],
                            padding=pad, data_format='NHWC')
        norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                            activation_fn=tf.nn.relu,
                                            is_training=training)
        return norm
def conv_block(inputs, numOut, name = 'conv_block'):
    """Bottleneck convolution block: three BN+ReLU -> conv stages.

    Channel flow is numOut/2 (1x1) -> numOut/2 (3x3, manually padded) ->
    numOut (1x1), i.e. the standard residual bottleneck used by the
    hourglass paper.

    Args:
        inputs: NHWC input tensor.
        numOut: number of output channels of the final 1x1 convolution.
        name: name scope for the block.

    Returns:
        Tensor with numOut channels and the same spatial size as `inputs`.

    NOTE(review): the pasted source lost its indentation; this reconstruction
    follows the canonical hourglass implementation, including wrapping the
    block in `name_scope(name)` (the original left `name` unused).
    NOTE(review): `training` is read from module scope — confirm it exists.
    """
    with tf.name_scope(name):
        with tf.name_scope('norm_1'):
            norm_1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5,
                                                  activation_fn=tf.nn.relu,
                                                  is_training=training)
            conv_1 = conv2d(norm_1, int(numOut / 2), kernel_size=1, strides=1,
                            pad='VALID', name='conv')
        with tf.name_scope('norm_2'):
            norm_2 = tf.contrib.layers.batch_norm(conv_1, 0.9, epsilon=1e-5,
                                                  activation_fn=tf.nn.relu,
                                                  is_training=training)
            # Explicit 1-pixel zero padding so the 3x3 VALID conv keeps the
            # spatial size unchanged.
            pad = tf.pad(norm_2, np.array([[0, 0], [1, 1], [1, 1], [0, 0]]),
                         name='pad')
            conv_2 = conv2d(pad, int(numOut / 2), kernel_size=3, strides=1,
                            pad='VALID', name='conv')
        with tf.name_scope('norm_3'):
            norm_3 = tf.contrib.layers.batch_norm(conv_2, 0.9, epsilon=1e-5,
                                                  activation_fn=tf.nn.relu,
                                                  is_training=training)
            conv_3 = conv2d(norm_3, int(numOut), kernel_size=1, strides=1,
                            pad='VALID', name='conv')
        return conv_3
def conv2d(inputs, filters, kernel_size = 1, strides = 1, pad = 'VALID', name = 'conv'):
    """Plain 2-D convolution with a Xavier-initialized kernel (no bias, no activation).

    Args:
        inputs: NHWC input tensor.
        filters: number of output channels.
        kernel_size: side length of the square kernel.
        strides: spatial stride for both H and W.
        pad: padding mode handed to tf.nn.conv2d.
        name: label for the op (unused by the graph itself).

    Returns:
        The raw convolution output.
    """
    in_channels = inputs.get_shape().as_list()[3]
    initializer = tf.contrib.layers.xavier_initializer(uniform=False)
    kernel = tf.Variable(
        initializer([kernel_size, kernel_size, in_channels, filters]),
        name='weights')
    return tf.nn.conv2d(inputs, kernel, [1, strides, strides, 1],
                        padding=pad, data_format='NHWC')
接着定义residual操作:
def residual(inputs, numOut, name = 'residual_block'):
    """Residual unit: bottleneck conv path plus identity/1x1 skip path.

    Args:
        inputs: NHWC input tensor.
        numOut: number of output channels of both branches.
        name: label for the block (not used to open a scope here).

    Returns:
        Element-wise sum of the convolutional branch and the skip branch.
    """
    main_branch = conv_block(inputs, numOut)
    shortcut = skip_layer(inputs, numOut)
    return tf.add_n([main_branch, shortcut], name='res_block')
def skip_layer(inputs, numOut, name = 'skip_layer'):
    """Skip branch of a residual unit.

    Passes the input through unchanged when its channel count already equals
    `numOut`; otherwise projects it with a 1x1 convolution so the two residual
    branches can be summed.

    Args:
        inputs: NHWC input tensor.
        numOut: required number of output channels.
        name: name scope for the branch.

    Returns:
        `inputs` itself, or its 1x1-convolved projection.
    """
    with tf.name_scope(name):
        if inputs.get_shape().as_list()[3] == numOut:
            return inputs
        return conv2d(inputs, numOut, kernel_size=1, strides=1, name='conv')
然后是hourglass操作:
def hourglass(inputs, n, numOut, name = 'hourglass'):
    """Recursive hourglass module.

    The upper branch keeps the input resolution; the lower branch max-pools,
    recurses `n` more levels (or bottoms out in a residual when n == 0), and
    is upsampled back to the upper branch's spatial size before the two are
    summed.

    Args:
        inputs: NHWC input tensor.
        n: remaining recursion depth.
        numOut: channel count used by every residual inside the module.
        name: name scope for this level.

    Returns:
        Sum of the full-resolution branch and the upsampled low-resolution branch.
    """
    with tf.name_scope(name):
        # Full-resolution (upper) branch.
        up_1 = residual(inputs, numOut, name='up_1')

        # Downsampled (lower) branch.
        low_ = tf.contrib.layers.max_pool2d(inputs, [2, 2], [2, 2], padding='VALID')
        low_1 = residual(low_, numOut, name='low_1')
        if n > 0:
            low_2 = hourglass(low_1, n - 1, numOut, name='low_2')
        else:
            low_2 = residual(low_1, numOut, name='low_2')
        low_3 = residual(low_2, numOut, name='low_3')

        # Resize to up_1's spatial size (robust to odd input dimensions,
        # where doubling low_3 would mismatch by one pixel).
        up_2 = tf.image.resize_nearest_neighbor(low_3, tf.shape(up_1)[1:3],
                                                name='upsampling')
        return tf.add_n([up_2, up_1], name='out_hg')
最后是整个network:
def net(inputs):
    """Full stacked-hourglass network.

    Pipeline: 256x256 input -> strided conv + pooling down to 64x64 ->
    `nStack` hourglass stages, each emitting an intermediate heatmap that is
    collected under the 'heatmaps' collection; stages are chained through a
    merged feature tensor (intermediate supervision).

    Args:
        inputs: NHWC image batch, expected nbImages x 256 x 256 x 3.

    Returns:
        The heatmap tensor of the final stage (nbImages x 64 x 64 x outDim).

    NOTE(review): relies on module-level globals nFeat, nLow, nStack, outDim,
    dropout_rate and training — confirm they are defined before calling.
    """
    # Input Dim : nbImages x 256 x 256 x 3
    pad1 = tf.pad(inputs, [[0, 0], [2, 2], [2, 2], [0, 0]], name='pad_1')
    # Dim pad1 : nbImages x 260 x 260 x 3
    conv1 = conv_bn_relu(pad1, filters=64, kernel_size=6, strides=2,
                         name='conv_256_to_128')
    # Dim conv1 : nbImages x 128 x 128 x 64
    r1 = residual(conv1, numOut=128, name='r1')
    # Dim r1 : nbImages x 128 x 128 x 128
    pool1 = tf.contrib.layers.max_pool2d(r1, [2, 2], [2, 2], padding='VALID')
    # Dim pool1 : nbImages x 64 x 64 x 128
    r2 = residual(pool1, numOut=int(nFeat / 2), name='r2')
    r3 = residual(r2, numOut=nFeat, name='r3')

    # --- Stage 0 ---
    hg = hourglass(r3, nLow, nFeat, 'hourglass')
    drop = tf.layers.dropout(hg, rate=dropout_rate, training=training,
                             name='dropout')
    ll = conv_bn_relu(drop, nFeat, 1, 1, 'VALID', name='conv')
    ll_0 = conv2d(ll, nFeat, 1, 1, 'VALID', 'll')
    out = conv2d(ll, outDim, 1, 1, 'VALID', 'out')
    tf.add_to_collection('heatmaps', out)
    out_ = conv2d(out, nFeat, 1, 1, 'VALID', 'out_')
    # BUG FIX: the original merged the pre-projection `ll` and left `ll_0`
    # dead; the reference implementation merges the 1x1-projected ll_ tensor.
    sum_ = tf.add_n([out_, ll_0, r3], name='merge')

    # --- Intermediate stages 1 .. nStack-2 ---
    for _ in range(1, nStack - 1):
        hg = hourglass(sum_, nLow, nFeat, 'hourglass')
        drop = tf.layers.dropout(hg, rate=dropout_rate, training=training,
                                 name='dropout')
        ll = conv_bn_relu(drop, nFeat, 1, 1, 'VALID', name='conv')
        ll_ = conv2d(ll, nFeat, 1, 1, 'VALID', 'll')
        out = conv2d(ll, outDim, 1, 1, 'VALID', 'out')
        tf.add_to_collection('heatmaps', out)
        out_ = conv2d(out, nFeat, 1, 1, 'VALID', 'out_')
        # BUG FIX: the original merged stage-0's ll_0 here instead of this
        # stage's ll_, leaving ll_ dead and reusing a stale tensor every
        # iteration.
        sum_ = tf.add_n([out_, sum_, ll_], name='merge')

    # --- Final stage: heatmap only, no merge needed ---
    with tf.name_scope('stage_' + str(nStack - 1)):
        hg = hourglass(sum_, nLow, nFeat, 'hourglass')
        drop = tf.layers.dropout(hg, rate=dropout_rate, training=training,
                                 name='dropout')
        ll = conv_bn_relu(drop, nFeat, 1, 1, 'VALID', 'conv')
        out = conv2d(ll, outDim, 1, 1, 'VALID', 'out')
    return out
参考资料: