Preface
I honestly haven't learned much lately, and I've been feeling a bit lost. My original reason for writing these posts was to record my own learning journey, but recently I've been feeling tired. Once the semester starts I'll be in my second year of grad school and my research topic still isn't settled; the general direction is roughly clear, but I'm still somewhat adrift. Add to that all the couples showing off on 520 (May 20th) and it has honestly been a bit draining. But life goes on; growing up is destined to be a road of staying strong through loneliness. Let's encourage each other, cough up some blood, and keep going.
input_pipeline()
def input_pipeline(dataset_pattern='train-*', is_training=True, batch_size=FLAGS.batch_size):
    def input_fn():
        out_shape = [FLAGS.train_image_size] * 2
        anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                                    layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                                    anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                                    extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                                    anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                                    layer_steps = [8, 16, 32, 64, 100, 300])
        all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()

        num_anchors_per_layer = []
        for ind in range(len(all_anchors)):
            num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])

        anchor_encoder_decoder = anchor_manipulator.AnchorEncoder(allowed_borders = [1.0] * 6,
                                    positive_threshold = FLAGS.match_threshold,
                                    ignore_threshold = FLAGS.neg_threshold,
                                    prior_scaling=[0.1, 0.1, 0.2, 0.2])

        image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
        anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

        image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes,
                                    batch_size,
                                    ('train' if is_training else 'val'),
                                    os.path.join(FLAGS.data_dir, dataset_pattern),
                                    FLAGS.num_readers,
                                    FLAGS.num_preprocessing_threads,
                                    image_preprocessing_fn,
                                    anchor_encoder_fn,
                                    num_epochs=FLAGS.train_epochs,
                                    is_training=is_training)
        global global_anchor_info
        global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
                              'num_anchors_per_layer': num_anchors_per_layer,
                              'all_num_anchors_depth': all_num_anchors_depth}

        return image, {'shape': shape, 'loc_targets': loc_targets, 'cls_targets': cls_targets, 'match_scores': match_scores}
    return input_fn
Let's go through it line by line:
out_shape = [FLAGS.train_image_size] * 2
This line builds the output size: with FLAGS.train_image_size = 300, out_shape = [300, 300].
anchor_creator = anchor_manipulator.AnchorCreator(out_shape,
                            layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
                            anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)],
                            extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)],
                            anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)],
                            layer_steps = [8, 16, 32, 64, 100, 300])
all_anchors, all_num_anchors_depth, all_num_anchors_spatial = anchor_creator.get_all_anchors()
These lines build the anchors. The first call simply passes in the configuration, which should be easy to read; the second call actually constructs the anchors, and its last two return values are the number of anchors per spatial location and the number of spatial locations for each feature map. Two custom pieces of code are involved here. AnchorCreator is fairly involved, so we will break it apart step by step; for now let's focus on get_all_anchors:
def get_all_anchors(self):
    all_anchors = []
    all_num_anchors_depth = []
    all_num_anchors_spatial = []
    for layer_index, layer_shape in enumerate(self._layers_shapes):
        anchors_this_layer = self.get_layer_anchors(layer_shape,
                                                    self._anchor_scales[layer_index],
                                                    self._extra_anchor_scales[layer_index],
                                                    self._anchor_ratios[layer_index],
                                                    self._layer_steps[layer_index],
                                                    self._anchor_offset[layer_index])
        all_anchors.append(anchors_this_layer[:-2])
        all_num_anchors_depth.append(anchors_this_layer[-2])
        all_num_anchors_spatial.append(anchors_this_layer[-1])
    return all_anchors, all_num_anchors_depth, all_num_anchors_spatial
This function merges the results for the 6 feature maps: it generates the anchors for each one and collects them together. Two of the return values deserve explanation: all_num_anchors_depth is [4, 6, 6, 6, 4, 4], the number of anchors attached to each spatial location of each feature map, and all_num_anchors_spatial is the number of spatial locations on each feature map (38*38, 19*19, ..., 1*1). all_anchors holds the concrete anchor information for each layer (the y and x center coordinates plus the heights and widths). This brings in another function, get_layer_anchors, shown after the short check below.
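The [4, 6, 6, 6, 4, 4] pattern follows directly from the scale and ratio lists passed to AnchorCreator; here is a minimal sketch of my own (not repository code) that reproduces it with the same formula get_layer_anchors uses for num_anchors_along_depth:

anchor_scales = [(0.1,), (0.2,), (0.375,), (0.55,), (0.725,), (0.9,)]
extra_anchor_scales = [(0.1414,), (0.2739,), (0.4541,), (0.6315,), (0.8078,), (0.9836,)]
anchor_ratios = [(1., 2., .5), (1., 2., 3., .5, 0.3333), (1., 2., 3., .5, 0.3333),
                 (1., 2., 3., .5, 0.3333), (1., 2., .5), (1., 2., .5)]

# One square anchor per extra scale, plus one anchor per (scale, ratio) pair.
depth = [len(s) * len(r) + len(e)
         for s, e, r in zip(anchor_scales, extra_anchor_scales, anchor_ratios)]
print(depth)  # [4, 6, 6, 6, 4, 4]

Now for get_layer_anchors itself: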
def get_layer_anchors(self, layer_shape, anchor_scale, extra_anchor_scale, anchor_ratio, layer_step, offset = 0.5):
    ''' assume layer_shape[0] = 6, layer_shape[1] = 5
    x_on_layer = [[0, 1, 2, 3, 4],
                  [0, 1, 2, 3, 4],
                  [0, 1, 2, 3, 4],
                  [0, 1, 2, 3, 4],
                  [0, 1, 2, 3, 4],
                  [0, 1, 2, 3, 4]]
    y_on_layer = [[0, 0, 0, 0, 0],
                  [1, 1, 1, 1, 1],
                  [2, 2, 2, 2, 2],
                  [3, 3, 3, 3, 3],
                  [4, 4, 4, 4, 4],
                  [5, 5, 5, 5, 5]]
    '''
    with tf.name_scope('get_layer_anchors'):
        x_on_layer, y_on_layer = tf.meshgrid(tf.range(layer_shape[1]), tf.range(layer_shape[0]))

        y_on_image = (tf.cast(y_on_layer, tf.float32) + offset) * layer_step / self._img_shape[0]
        x_on_image = (tf.cast(x_on_layer, tf.float32) + offset) * layer_step / self._img_shape[1]

        num_anchors_along_depth = len(anchor_scale) * len(anchor_ratio) + len(extra_anchor_scale)
        num_anchors_along_spatial = layer_shape[1] * layer_shape[0]

        list_h_on_image = []
        list_w_on_image = []

        global_index = 0
        # for square anchors
        for _, scale in enumerate(extra_anchor_scale):
            list_h_on_image.append(scale)
            list_w_on_image.append(scale)
            global_index += 1
        # for other aspect ratio anchors
        for scale_index, scale in enumerate(anchor_scale):
            for ratio_index, ratio in enumerate(anchor_ratio):
                list_h_on_image.append(scale / math.sqrt(ratio))
                list_w_on_image.append(scale * math.sqrt(ratio))
                global_index += 1
        # shape info:
        # y_on_image, x_on_image: layers_shapes[0] * layers_shapes[1]
        # h_on_image, w_on_image: num_anchors_along_depth
        return tf.expand_dims(y_on_image, axis=-1), tf.expand_dims(x_on_image, axis=-1), \
               tf.constant(list_h_on_image, dtype=tf.float32), \
               tf.constant(list_w_on_image, dtype=tf.float32), num_anchors_along_depth, num_anchors_along_spatial
tf.meshgrid builds the coordinate grids shown in the docstring: x_on_layer and y_on_layer are the x and y indices of every cell on the feature map. They are then shifted by offset = 0.5 to the cell centers, multiplied by layer_step to map back onto the original image, and normalized by the image size. list_h_on_image and list_w_on_image are the anchor heights and widths (also normalized). So for one layer this function produces the anchor y coordinates, x coordinates, heights, widths, the number of anchors per location, and the number of locations.
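To make the mapping from feature-map cells to image coordinates concrete, here is a small NumPy sketch (my own illustration, not the repository's code) of the same center computation for the first layer, where layer_shape = (38, 38), layer_step = 8 and the image is 300 x 300:

import numpy as np

layer_shape = (38, 38)
layer_step = 8
img_size = 300.0
offset = 0.5

x_on_layer, y_on_layer = np.meshgrid(np.arange(layer_shape[1]), np.arange(layer_shape[0]))
# Shift each cell index to the cell center, map back to the input image, normalize to [0, 1].
y_on_image = (y_on_layer.astype(np.float32) + offset) * layer_step / img_size
x_on_image = (x_on_layer.astype(np.float32) + offset) * layer_step / img_size

print(y_on_image[0, 0], x_on_image[0, 0])      # ~0.0133, ~0.0133 (center of the top-left cell)
print(y_on_image[-1, -1], x_on_image[-1, -1])  # 1.0, 1.0, since (37 + 0.5) * 8 / 300 = 1.0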
Now back to the main code:
for ind in range(len(all_anchors)):
    num_anchors_per_layer.append(all_num_anchors_depth[ind] * all_num_anchors_spatial[ind])
This computes the number of anchors on each feature map, giving [5776, 2166, 600, 150, 36, 4].
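These numbers are just depth * spatial for each layer; a quick check of my own (not repository code):

layers_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
all_num_anchors_depth = [4, 6, 6, 6, 4, 4]

num_anchors_per_layer = [d * h * w for d, (h, w) in zip(all_num_anchors_depth, layers_shapes)]
print(num_anchors_per_layer)       # [5776, 2166, 600, 150, 36, 4]
print(sum(num_anchors_per_layer))  # 8732, the classic SSD300 anchor count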
image_preprocessing_fn = lambda image_, labels_, bboxes_ : ssd_preprocessing.preprocess_image(image_, labels_, bboxes_, out_shape, is_training=is_training, data_format=FLAGS.data_format, output_rgb=False)
anchor_encoder_fn = lambda glabels_, gbboxes_: anchor_encoder_decoder.encode_all_anchors(glabels_, gbboxes_, all_anchors, all_num_anchors_depth, all_num_anchors_spatial)

image, _, shape, loc_targets, cls_targets, match_scores = dataset_common.slim_get_batch(FLAGS.num_classes,
                            batch_size,
                            ('train' if is_training else 'val'),
                            os.path.join(FLAGS.data_dir, dataset_pattern),
                            FLAGS.num_readers,
                            FLAGS.num_preprocessing_threads,
                            image_preprocessing_fn,
                            anchor_encoder_fn,
                            num_epochs=FLAGS.train_epochs,
                            is_training=is_training)
Here two lambdas are defined and then handed to slim_get_batch. ssd_preprocessing.py does the image preprocessing (random horizontal flips and similar augmentations that expand the dataset). anchor_manipulator.py does the encoding (converting ground-truth coordinates into offsets relative to the anchors) and marks positive/negative samples, etc. After the call, images and their annotations are returned in batches of batch_size.
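The exact encoding lives in encode_all_anchors in anchor_manipulator.py; the underlying idea is the standard SSD box encoding with prior_scaling = [0.1, 0.1, 0.2, 0.2]. A rough single-box sketch of that idea (my own variable names, not the repository's code, which also handles matching and batching in TensorFlow):

import numpy as np

def encode_box(gt_cy, gt_cx, gt_h, gt_w,
               anchor_cy, anchor_cx, anchor_h, anchor_w,
               prior_scaling=(0.1, 0.1, 0.2, 0.2)):
    # Center offsets are measured relative to the anchor size,
    # sizes are encoded as log ratios; prior_scaling rescales each term.
    ty = (gt_cy - anchor_cy) / anchor_h / prior_scaling[0]
    tx = (gt_cx - anchor_cx) / anchor_w / prior_scaling[1]
    th = np.log(gt_h / anchor_h) / prior_scaling[2]
    tw = np.log(gt_w / anchor_w) / prior_scaling[3]
    return ty, tx, th, tw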
global global_anchor_info
global_anchor_info = {'decode_fn': lambda pred : anchor_encoder_decoder.decode_all_anchors(pred, num_anchors_per_layer),
                      'num_anchors_per_layer': num_anchors_per_layer,
                      'all_num_anchors_depth': all_num_anchors_depth}
This stores a decode_fn that converts predicted offsets back into actual box coordinates. It relies on decode_all_anchors, which is exactly the inverse of the encode_all_anchors function used above.
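Continuing the sketch above (again my own illustration, not the repository's exact code), decoding simply inverts each step:

import numpy as np

def decode_box(ty, tx, th, tw,
               anchor_cy, anchor_cx, anchor_h, anchor_w,
               prior_scaling=(0.1, 0.1, 0.2, 0.2)):
    # Inverse of encode_box: recover a box (center, size) from predicted offsets.
    pred_cy = ty * prior_scaling[0] * anchor_h + anchor_cy
    pred_cx = tx * prior_scaling[1] * anchor_w + anchor_cx
    pred_h = anchor_h * np.exp(th * prior_scaling[2])
    pred_w = anchor_w * np.exp(tw * prior_scaling[3])
    return pred_cy, pred_cx, pred_h, pred_w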
To summarize, input_pipeline reads the images and labels and generates the anchors. This part may look unremarkable, but it is actually the core: the heart of a detection framework is ultimately the refinement of predicted boxes, and this is precisely where those boxes (the anchors) are generated.
Finally
That wraps up this part; what remains is the main network structure and the loss. Please forgive the rough writing, and my sincere respect goes out to all the seniors in the field.