Cascaded Pyramid Network
1 网络结构
2 核心思想
- 联级网络结构;
- coarse-to-fine(由粗到细)的思想;
- 难关键点挖掘。
3 代码
3.1 构造网络
FPN 的横向连接(lateral connection)要加 BN、激活和 bias;而上采样之后的 1x1 卷积不加 BN 和激活。
def create_global_net(blocks, is_training, trainable=True):
    """Build the GlobalNet (FPN-style) half of CPN.

    Args:
        blocks: backbone feature maps ordered shallow-to-deep (reversed below,
            so the loop walks deep-to-shallow, i.e. C5 first).
        is_training: batch-norm training flag, forwarded to resnet_arg_scope.
        trainable: whether the conv weights created here are trainable.

    Returns:
        (global_fms, global_outs):
        global_fms  - merged pyramid feature maps, reversed back to
                      shallow-to-deep order; fed into the refine net.
        global_outs - per-level heatmaps resized to cfg.output_shape,
                      used for intermediate supervision.
    """
    global_fms = []   # feature maps passed on to the refine net
    global_outs = []  # per-level outputs used for intermediate supervision
    last_fm = None
    initializer = tf.contrib.layers.xavier_initializer()
    for i, block in enumerate(reversed(blocks)):
        with slim.arg_scope(resnet_arg_scope(bn_is_training=is_training)):
            # Lateral 1x1 connection (with BN + ReLU via arg_scope).
            lateral = slim.conv2d(block, 256, [1, 1],
                                  trainable=trainable, weights_initializer=initializer,
                                  padding='SAME', activation_fn=tf.nn.relu,
                                  scope='lateral/res{}'.format(5-i))  # scopes res5, res4, res3, res2
        if last_fm is not None:
            sz = tf.shape(lateral)  # spatial size of the current level
            # Upsample the deeper merged map to this level's resolution.
            upsample = tf.image.resize_bilinear(last_fm, (sz[1], sz[2]),
                                                name='upsample/res{}'.format(5-i))
            # The original FPN uses a 3x3 conv after merging; the CPN paper
            # replaces it with a 1x1 conv applied BEFORE the element-wise add.
            upsample = slim.conv2d(upsample, 256, [1, 1],
                                   trainable=trainable, weights_initializer=initializer,
                                   padding='SAME', activation_fn=None,
                                   scope='merge/res{}'.format(5-i))
            last_fm = upsample + lateral
        else:
            last_fm = lateral  # topmost level (C5) has nothing deeper to merge with
        global_fms.append(last_fm)
        # The convs below produce the per-level heatmaps used for the
        # intermediate supervision loss.
        with slim.arg_scope(resnet_arg_scope(bn_is_training=is_training)):
            tmp = slim.conv2d(last_fm, 256, [1, 1],
                              trainable=trainable, weights_initializer=initializer,
                              padding='SAME', activation_fn=tf.nn.relu,
                              scope='tmp/res{}'.format(5-i))
            # Matches the paper: "3x3 convolution filters are applied on
            # c2, ..., c5 to generate the heatmaps for keypoints."
            out = slim.conv2d(tmp, cfg.nr_skeleton, [3, 3],
                              trainable=trainable, weights_initializer=initializer,
                              padding='SAME', activation_fn=None,
                              scope='pyramid/res{}'.format(5-i))
        # Each level's heatmap is resized to a common supervision resolution
        # (cfg.output_shape, a quarter of the input image) to match the labels.
        global_outs.append(tf.image.resize_bilinear(out, (cfg.output_shape[0], cfg.output_shape[1])))
    # The loop ran deep-to-shallow; reverse so the lists are [P2, P3, P4, P5].
    global_fms.reverse()
    global_outs.reverse()
    return global_fms, global_outs
def create_refine_net(blocks, is_training, trainable=True):
    """Build the RefineNet half of CPN.

    Args:
        blocks: GlobalNet pyramid feature maps [P2, P3, P4, P5],
            shallow-to-deep.
        is_training: batch-norm training flag, forwarded to resnet_arg_scope.
        trainable: whether the final conv weights are trainable.

    Returns:
        A single heatmap tensor with cfg.nr_skeleton channels at
        cfg.output_shape resolution.
    """
    initializer = tf.contrib.layers.xavier_initializer()
    bottleneck = resnet_v1.bottleneck
    refine_fms = []  # one refined feature map per pyramid level
    for i, block in enumerate(blocks):  # blocks are [P2, P3, P4, P5]
        mid_fm = block
        with slim.arg_scope(resnet_arg_scope(bn_is_training=is_training)):
            # Level 0 (P2) gets no extra bottlenecks; deeper levels get i of
            # them. Coarser levels sat nearer the bottom of the U-shape and
            # went through fewer convolutions, so they are refined more here,
            # equalizing the effective conv depth across levels.
            for j in range(i):
                mid_fm = bottleneck(mid_fm, 256, 128, stride=1,
                                    scope='res{}/refine_conv{}'.format(2+i, j))  # no projection shortcut
            # Upsample directly to the supervision resolution
            # (cfg.output_shape, a quarter of the input image).
            mid_fm = tf.image.resize_bilinear(mid_fm, (cfg.output_shape[0], cfg.output_shape[1]),
                                              name='upsample_conv/res{}'.format(2+i))
        refine_fms.append(mid_fm)
    # Concatenate all levels channel-wise, squeeze through one more
    # bottleneck, then predict one channel per keypoint.
    refine_fm = tf.concat(refine_fms, axis=3)
    with slim.arg_scope(resnet_arg_scope(bn_is_training=is_training)):
        # bottleneck(inputs, depth, depth_bottleneck, stride)
        refine_fm = bottleneck(refine_fm, 256, 128, stride=1, scope='final_bottleneck')
        res = slim.conv2d(refine_fm, cfg.nr_skeleton, [3, 3],
                          trainable=trainable, weights_initializer=initializer,
                          padding='SAME', activation_fn=None,
                          scope='refine_out')
    return res
class Network(ModelDesc):
    """CPN training/inference graph: ResNet-101 backbone + GlobalNet + RefineNet."""

    def make_data(self):
        """Build the training data pipeline and return a batch iterator."""
        from COCOAllJoints import COCOJoints
        from dataset import Preprocessing
        d = COCOJoints()
        train_data, _ = d.load_data(cfg.min_kps)
        from tfflat.data_provider import DataFromList, MultiProcessMapDataZMQ, BatchData, MapData
        dp = DataFromList(train_data)
        if cfg.dpflow_enable:
            # Multi-process preprocessing over ZMQ.
            dp = MultiProcessMapDataZMQ(dp, cfg.nr_dpflows, Preprocessing)
        else:
            dp = MapData(dp, Preprocessing)
        # Each preprocessed sample already contains cfg.nr_aug augmented copies,
        # hence the division here to keep the effective batch at cfg.batch_size.
        dp = BatchData(dp, cfg.batch_size // cfg.nr_aug)
        dp.reset_state()
        dataiter = dp.get_data()
        return dataiter

    def make_network(self, is_train):
        """Assemble the forward graph; in training mode also attach the losses.

        The four labels are heatmaps rendered with decreasing Gaussian kernel
        sizes (15/11/9/7); label7 (sharpest) supervises the refine output.
        """
        if is_train:
            image = tf.placeholder(tf.float32, shape=[cfg.batch_size, *cfg.data_shape, 3])
            label15 = tf.placeholder(tf.float32, shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
            label11 = tf.placeholder(tf.float32, shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
            label9 = tf.placeholder(tf.float32, shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
            label7 = tf.placeholder(tf.float32, shape=[cfg.batch_size, *cfg.output_shape, cfg.nr_skeleton])
            valids = tf.placeholder(tf.float32, shape=[cfg.batch_size, cfg.nr_skeleton])
            labels = [label15, label11, label9, label7]
            # labels.reverse() # The original labels are reversed. For reproduction of our pre-trained model, I'll keep it same.
            self.set_inputs(image, label15, label11, label9, label7, valids)
        else:
            image = tf.placeholder(tf.float32, shape=[None, *cfg.data_shape, 3])
            self.set_inputs(image)

        resnet_fms = resnet101(image, is_train, bn_trainable=True)
        global_fms, global_outs = create_global_net(resnet_fms, is_train)
        # refine_out is the single final prediction; only it incurs the OHKM loss.
        refine_out = create_refine_net(global_fms, is_train)

        # make loss
        if is_train:
            def ohkm(loss, top_k):
                """Online hard keypoint mining: average only the top_k largest
                per-keypoint losses for each sample in the batch."""
                ohkm_loss = 0.
                for i in range(cfg.batch_size):
                    # Per-sample loss vector, one entry per keypoint channel.
                    sub_loss = loss[i]
                    # Values and indices of the top_k hardest keypoints.
                    topk_val, topk_idx = tf.nn.top_k(sub_loss, k=top_k, sorted=False, name='ohkm{}'.format(i))
                    # NOTE(review): tmp_loss equals topk_val (gather of the same
                    # indices) — kept as-is to reproduce the original graph.
                    tmp_loss = tf.gather(sub_loss, topk_idx, name='ohkm_loss{}'.format(i)) # can be ignore ???
                    ohkm_loss += tf.reduce_sum(tmp_loss) / top_k
                ohkm_loss /= cfg.batch_size
                return ohkm_loss

            global_loss = 0.
            # GlobalNet loss: one MSE term per pyramid level, averaged over
            # levels, then halved to balance against the refine loss.
            for i, (global_out, label) in enumerate(zip(global_outs, labels)):
                # valids > 1.1 keeps only keypoints marked as clearly visible
                # for the intermediate supervision; others get a zero target.
                global_label = label * tf.to_float(tf.greater(tf.reshape(valids, (-1, 1, 1, cfg.nr_skeleton)), 1.1))
                global_loss += tf.reduce_mean(tf.square(global_out - global_label)) / len(labels)
            global_loss /= 2.
            self.add_tower_summary('global_loss', global_loss)

            # Refine loss: reduce over spatial dims only, keeping the
            # [batch, nr_skeleton] shape so OHKM can rank keypoints; the
            # looser valids > 0.1 mask keeps all labeled keypoints.
            refine_loss = tf.reduce_mean(tf.square(refine_out - label7), (1,2)) * tf.to_float((tf.greater(valids, 0.1)))
            refine_loss = ohkm(refine_loss, 8)  # online hard keypoint mining, top-8
            self.add_tower_summary('refine_loss', refine_loss)

            # Total loss is the plain sum of both terms.
            total_loss = refine_loss + global_loss
            self.add_tower_summary('loss', total_loss)
            self.set_loss(total_loss)
        else:
            self.set_outputs(refine_out)
3.2 heatmap生成
def joints_heatmap_gen(data, label, tar_size=cfg.output_shape, ori_size=cfg.data_shape, points=cfg.nr_skeleton,
                       return_valid=False, gaussian_kernel=cfg.gaussain_kernel):
    """Render per-keypoint Gaussian heatmaps from joint coordinates.

    Args:
        data: batch of images; only len(data) (the batch size) is used.
        label: per-sample flat coordinate arrays, (x, y) pairs at indices
            (2*j, 2*j+1) for keypoint j, in input-image pixels. A negative
            coordinate marks an absent keypoint. NOTE: clamped IN PLACE.
        tar_size: (height, width) of the produced heatmaps.
        ori_size: (height, width) of the input images the labels refer to.
        points: number of keypoints per sample.
        return_valid: if True, also return a (batch, points) float mask that
            is 0 where the rendered map came out empty.
        gaussian_kernel: (kx, ky) kernel size for cv2.GaussianBlur.

    Returns:
        ret of shape (batch, points, tar_h, tar_w), peak-normalized to 255;
        plus the validity mask when return_valid is True.
    """
    if return_valid:
        valid = np.ones((len(data), points), dtype=np.float32)
    # Layout: (batch, keypoint channel, height, width).
    ret = np.zeros((len(data), points, tar_size[0], tar_size[1]), dtype='float32')
    for i in range(len(ret)):
        for j in range(points):
            # x lives at index 2*j (j << 1), y at 2*j + 1 (j << 1 | 1).
            if label[i][j << 1] < 0 or label[i][j << 1 | 1] < 0:
                # Negative coordinate => keypoint absent; leave map empty.
                continue
            # Clamp y (height) and x (width) to the image bounds — this
            # mutates the caller's label array in place.
            label[i][j << 1 | 1] = min(label[i][j << 1 | 1], ori_size[0] - 1)
            label[i][j << 1] = min(label[i][j << 1], ori_size[1] - 1)
            # Scale from input resolution to heatmap resolution and drop a
            # unit impulse at the keypoint location.
            row = int(label[i][j << 1 | 1] * (tar_size[0] / ori_size[0]))
            col = int(label[i][j << 1] * (tar_size[1] / ori_size[1]))
            ret[i][j][row][col] = 1
    for i in range(len(ret)):
        for j in range(points):
            # Spread each impulse into a Gaussian blob.
            ret[i, j] = cv2.GaussianBlur(ret[i, j], gaussian_kernel, 0)
    for i in range(len(ret)):
        # BUGFIX: iterate over `points` (the parameter) rather than
        # cfg.nr_skeleton, so a caller-supplied keypoint count is honored
        # consistently with the loops above. Identical when points is left
        # at its default.
        for j in range(points):
            am = np.amax(ret[i][j])  # peak of this keypoint's map
            if am <= 1e-8:
                # Effectively empty map (missing keypoint): flag as invalid.
                if return_valid:
                    valid[i][j] = 0.
                continue
            # Rescale so the peak equals 255 (i.e. ret/am scaled to 0..255).
            ret[i][j] /= am / 255
    if return_valid:
        return ret, valid
    else:
        return ret