- Faster R-CNN源码阅读之零:写在前面
- Faster R-CNN源码阅读之一:Faster R-CNN/lib/networks/network.py
- Faster R-CNN源码阅读之二:Faster R-CNN/lib/networks/factory.py
- Faster R-CNN源码阅读之三:Faster R-CNN/lib/networks/VGGnet_test.py
- Faster R-CNN源码阅读之四:Faster R-CNN/lib/rpn_msr/generate_anchors.py
- Faster R-CNN源码阅读之五:Faster R-CNN/lib/rpn_msr/proposal_layer_tf.py
- Faster R-CNN源码阅读之六:Faster R-CNN/lib/fast_rcnn/bbox_transform.py
- Faster R-CNN源码阅读之七:Faster R-CNN/lib/rpn_msr/anchor_target_layer_tf.py
- Faster R-CNN源码阅读之八:Faster R-CNN/lib/rpn_msr/proposal_target_layer_tf.py
- Faster R-CNN源码阅读之九:Faster R-CNN/tools/train_net.py
- Faster R-CNN源码阅读之十:Faster R-CNN/lib/fast_rcnn/train.py
- Faster R-CNN源码阅读之十一:Faster R-CNN预测demo代码补完
- Faster R-CNN源码阅读之十二:写在最后
一、介绍
本demo由Faster R-CNN官方提供,我只是在官方的代码上增加了注释,一方面方便我自己学习,另一方面贴出来和大家一起交流。
该文件中的函数和类的主要目的是产生一个基类,并在类中封装好需要的方法,以后生成网络时可以直接调用已经封装好的方法。
二、代码以及注释
# -*- coding:utf-8 -*-
import functools

import numpy as np
import tensorflow as tf

import roi_pooling_layer.roi_pooling_op as roi_pool_op
import roi_pooling_layer.roi_pooling_op_grad
from rpn_msr.proposal_layer_tf import proposal_layer as proposal_layer_py
from rpn_msr.anchor_target_layer_tf import anchor_target_layer as anchor_target_layer_py
from rpn_msr.proposal_target_layer_tf import proposal_target_layer as proposal_target_layer_py
DEFAULT_PADDING = 'SAME'


# Decorator applied to every layer-building method of Network.
def layer(op):
    '''
    Wrap a layer-constructing method so the network can be described as a
    chain of calls: the wrapper resolves a unique layer name, gathers the
    pending inputs collected by feed(), invokes the wrapped op, registers
    its output in self.layers, feeds the output forward as the next layer's
    input, and returns self to allow chaining.

    :param op: the layer-building method to wrap (e.g. conv, fc, softmax)
    :return: the chainable wrapper around op
    '''
    # functools.wraps preserves op's __name__/__doc__ on the wrapper, so
    # decorated layer methods remain introspectable (the original decorator
    # reported every layer method as "layer_decorated").
    @functools.wraps(op)
    def layer_decorated(self, *args, **kwargs):
        # Automatically set a name if not provided.
        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
        # Figure out the layer inputs: a single pending input is passed as-is,
        # multiple pending inputs are passed as a list.
        if len(self.inputs) == 0:
            raise RuntimeError('No input variables found for layer %s.' % name)
        elif len(self.inputs) == 1:
            layer_input = self.inputs[0]
        else:
            layer_input = list(self.inputs)
        # Perform the operation and get the output.
        layer_output = op(self, layer_input, *args, **kwargs)
        # Add the output to the layer lookup table.
        self.layers[name] = layer_output
        # This output is now the input for the next layer.
        self.feed(layer_output)
        # Return self so calls can be chained, regardless of which layer
        # method was decorated.
        return self
    return layer_decorated
# Network is a base class: concrete networks (e.g. VGGnet_train/VGGnet_test)
# subclass it, and describe their architecture in setup() by chaining the
# @layer-decorated methods below.
class Network(object):
    def __init__(self, inputs, trainable=True):
        # self.inputs holds the pending inputs for the next layer
        # (i.e. the outputs of the previously fed layer(s)).
        self.inputs = []
        # self.layers maps layer names to their output tensors; it is seeded
        # with the network's input placeholders.
        self.layers = dict(inputs)
        # Whether variables created by this network are trainable.
        self.trainable = trainable
        self.setup()

    def setup(self):
        # Must be implemented by subclasses to build the actual network.
        raise NotImplementedError('Must be subclassed.')

    def load(self, data_path, session, saver, ignore_missing=False):
        '''
        Load pretrained model weights.
        :param data_path: path to the model file
        :param session: tf session
        :param saver: tf Saver instance
        :param ignore_missing: whether to ignore weights that have no
                               matching variable in the current graph
        :return: None
        '''
        # A TensorFlow checkpoint can be restored directly.
        if data_path.endswith('.ckpt'):
            saver.restore(session, data_path)
        # Otherwise assume a numpy-pickled nested dict, laid out as
        # {scope_name: {variable_name: ndarray}}.
        else:
            # Load the weight dict with numpy.
            data_dict = np.load(data_path).item()
            # Assign every stored array to the matching tf variable.
            for key in data_dict:
                with tf.variable_scope(key, reuse=True):
                    for subkey in data_dict[key]:
                        try:
                            var = tf.get_variable(subkey)
                            session.run(var.assign(data_dict[key][subkey]))
                            print "assign pretrain model " + subkey + " to " + key
                        except ValueError:
                            # No such variable in the graph; re-raise unless
                            # the caller asked to ignore missing weights.
                            print "ignore " + key
                            if not ignore_missing:
                                raise

    def feed(self, *args):
        '''
        Register the inputs for the next layer.
        :param args: variable arguments; each is either a basestring naming
                     a layer stored in self.layers, or a tf Tensor that can
                     be appended to self.inputs directly
        :return: self
        '''
        # Calling feed() with no arguments is an error.
        assert len(args) != 0
        # Clear the previous pending inputs; they have already been consumed.
        self.inputs = []
        # For each passed argument:
        for layer in args:
            # If it is a basestring, look up the corresponding layer output.
            if isinstance(layer, basestring):
                try:
                    layer = self.layers[layer]
                    print layer
                except KeyError:
                    print self.layers.keys()
                    raise KeyError('Unknown layer name fed: %s' % layer)
            # Record the resolved tensor as a pending input for the next layer.
            self.inputs.append(layer)
        return self

    def get_output(self, layer):
        '''
        Fetch a layer's output by name.
        :param layer: a string key identifying the layer
        :return: the corresponding layer output tensor
        '''
        try:
            layer = self.layers[layer]
        except KeyError:
            print self.layers.keys()
            raise KeyError('Unknown layer name fed: %s' % layer)
        return layer

    def get_unique_name(self, prefix):
        '''
        Generate a name, unique within self.layers, for the given prefix.
        :param prefix: a string prefix (typically the op's name, e.g. 'conv')
        :return: a unique name of the form '<prefix>_<count>'
        '''
        # Count existing layers sharing the prefix and append the next index.
        id = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
        return '%s_%d' % (prefix, id)

    def make_var(self, name, shape, initializer=None, trainable=True):
        '''
        Create a tf variable with the given parameters.
        :param name: variable name
        :param shape: variable shape
        :param initializer: variable initializer
        :param trainable: whether the variable is trainable
        :return: the created tf variable
        '''
        return tf.get_variable(name, shape, initializer=initializer, trainable=trainable)

    def validate_padding(self, padding):
        '''
        Check that the padding mode is legal ('SAME' or 'VALID').
        :param padding: the padding mode to validate
        :return: None
        '''
        assert padding in ('SAME', 'VALID')

    # The methods below, decorated with @layer, are the layer builders.
    @layer
    def conv(self, input, k_h, k_w, c_o, s_h, s_w, name, relu=True, padding=DEFAULT_PADDING, group=1, trainable=True):
        '''
        Convolution layer.
        :param input: the tensor to convolve
        :param k_h: kernel height
        :param k_w: kernel width
        :param c_o: number of output channels (kernels)
        :param s_h: stride height
        :param s_w: stride width
        :param name: op name
        :param relu: whether to apply relu activation
        :param padding: padding mode
        :param group: number of groups (grouped convolution)
        :param trainable: whether the weights are trainable
        :return: the convolved tensor
        '''
        self.validate_padding(padding)
        # Input channel count; both channel counts must divide evenly by group.
        c_i = input.get_shape()[-1]
        assert c_i % group == 0
        assert c_o % group == 0
        convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
        with tf.variable_scope(name) as scope:
            init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01)
            init_biases = tf.constant_initializer(0.0)
            kernel = self.make_var('weights', [k_h, k_w, c_i / group, c_o], init_weights, trainable)
            biases = self.make_var('biases', [c_o], init_biases, trainable)
            if group == 1:
                conv = convolve(input, kernel)
            else:
                # Grouped convolution: split input and kernels along the
                # channel axis, convolve each pair, then concatenate.
                # (Old TF API: tf.split/tf.concat take the axis first.)
                input_groups = tf.split(3, group, input)
                kernel_groups = tf.split(3, group, kernel)
                output_groups = [convolve(i, k) for i, k in zip(input_groups, kernel_groups)]
                conv = tf.concat(3, output_groups)
            if relu:
                bias = tf.nn.bias_add(conv, biases)
                return tf.nn.relu(bias, name=scope.name)
            return tf.nn.bias_add(conv, biases, name=scope.name)

    @layer
    def relu(self, input, name):
        '''
        Relu activation layer.
        :param input: the tensor to activate
        :param name: op name
        :return: the activated tensor
        '''
        return tf.nn.relu(input, name=name)

    @layer
    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
        '''
        Max pooling layer.
        :param input: the tensor to pool
        :param k_h: pooling kernel height
        :param k_w: pooling kernel width
        :param s_h: stride height
        :param s_w: stride width
        :param name: op name
        :param padding: padding mode
        :return: the pooled tensor
        '''
        self.validate_padding(padding)
        return tf.nn.max_pool(input,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],
                              padding=padding,
                              name=name)

    @layer
    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
        '''
        Average pooling layer.
        :param input: the tensor to pool
        :param k_h: pooling kernel height
        :param k_w: pooling kernel width
        :param s_h: stride height
        :param s_w: stride width
        :param name: op name
        :param padding: padding mode
        :return: the pooled tensor
        '''
        self.validate_padding(padding)
        return tf.nn.avg_pool(input,
                              ksize=[1, k_h, k_w, 1],
                              strides=[1, s_h, s_w, 1],
                              padding=padding,
                              name=name)

    @layer
    def roi_pool(self, input, pooled_height, pooled_width, spatial_scale, name):
        '''
        ROI pooling layer.
        :param input: a two-element input, containing the feature map and
                      the rois
        :param pooled_height: output height after pooling
        :param pooled_width: output width after pooling
        :param spatial_scale: spatial scale, usually the reciprocal of the
                              total feature stride
        :param name: op name
        :return: the pooled tensor
        '''
        # Only use the first element when an input is a tuple.
        if isinstance(input[0], tuple):
            input[0] = input[0][0]
        if isinstance(input[1], tuple):
            input[1] = input[1][0]
        print input
        # The custom op returns multiple tensors; only the first (the pooled
        # features) is kept.
        return roi_pool_op.roi_pool(input[0], input[1],
                                    pooled_height,
                                    pooled_width,
                                    spatial_scale,
                                    name=name)[0]

    @layer
    def proposal_layer(self, input, _feat_stride, anchor_scales, cfg_key, name):
        '''
        Proposal layer.
        :param input: input tensors
        :param _feat_stride: feature stride, usually a list of ints
        :param anchor_scales: anchor scales, usually a list of ints
        :param cfg_key: configuration key, a string (e.g. 'TRAIN'/'TEST')
        :param name: op name
        :return: the batch inds and coordinates of the top-N proposals
                 after sorting, shaped [-1, 5]
        '''
        if isinstance(input[0], tuple):
            input[0] = input[0][0]
        # Run the python implementation through tf.py_func and reshape the
        # result to [N, 5] (batch index + 4 box coordinates).
        return tf.reshape(
            tf.py_func(proposal_layer_py, [input[0], input[1], input[2], cfg_key, _feat_stride, anchor_scales],
                       [tf.float32]), [-1, 5], name=name)

    @layer
    def anchor_target_layer(self, input, _feat_stride, anchor_scales, name):
        '''
        Assign anchors to ground-truth targets. Produces anchor classification
        labels and bounding-box regression targets.
        :param input: input tensors
        :param _feat_stride: feature stride, usually a list of ints
        :param anchor_scales: anchor scales, usually a list of ints
        :param name: op name
        :return: rpn classification labels, rpn bbox regression targets, and
                 the rpn bbox inside and outside weights
        '''
        if isinstance(input[0], tuple):
            input[0] = input[0][0]
        with tf.variable_scope(name) as scope:
            rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = tf.py_func(
                anchor_target_layer_py, [input[0], input[1], input[2], input[3], _feat_stride, anchor_scales],
                [tf.float32, tf.float32, tf.float32, tf.float32])
            # Wrap the py_func outputs back into named tensors (labels as int32).
            rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels, tf.int32), name='rpn_labels')
            rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets, name='rpn_bbox_targets')
            rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights, name='rpn_bbox_inside_weights')
            rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights, name='rpn_bbox_outside_weights')
            return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights

    @layer
    def proposal_target_layer(self, input, classes, name):
        """
        Assign object detection proposals to ground-truth targets. Produces
        proposal classification labels and bounding-box regression targets.
        :param input: rpn_rois and gt_boxes
        :param classes: number of classes
        :param name: op name
        :return: rois, roi labels, bbox targets, bbox inside weights and
                 bbox outside weights
        """
        if isinstance(input[0], tuple):
            input[0] = input[0][0]
        with tf.variable_scope(name) as scope:
            rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = tf.py_func(proposal_target_layer_py,
                                                                                               [input[0], input[1],
                                                                                                classes],
                                                                                               [tf.float32, tf.float32,
                                                                                                tf.float32, tf.float32,
                                                                                                tf.float32])
            # rois: [N, 5] (batch index + 4 coordinates); labels as int32.
            rois = tf.reshape(rois, [-1, 5], name='rois')
            labels = tf.convert_to_tensor(tf.cast(labels, tf.int32), name='labels')
            bbox_targets = tf.convert_to_tensor(bbox_targets, name='bbox_targets')
            bbox_inside_weights = tf.convert_to_tensor(bbox_inside_weights, name='bbox_inside_weights')
            bbox_outside_weights = tf.convert_to_tensor(bbox_outside_weights, name='bbox_outside_weights')
            return rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights

    @layer
    def reshape_layer(self, input, d, name):
        '''
        Reshape the input tensor so its channel dimension becomes d.
        :param input: input tensor, laid out [N, H, W, C]
        :param d: the desired channel count after reshaping
        :param name: op name
        :return: the reshaped tensor
        '''
        input_shape = tf.shape(input)
        if name == 'rpn_cls_prob_reshape':
            # step 1: tmp1 = tf.transpose(input, [0, 3, 1, 2])  # reorder [N, H, W, C] -> [N, C, H, W]
            # step 2: dim = tf.cast(tf.cast(input_shape[1], tf.float32) / tf.cast(d, tf.float32) * tf.cast(input_shape[3], tf.float32), tf.int32)
            #         # with the tf.cast calls removed: dim = input_shape[1] / d * input_shape[3]
            #         # i.e. the third dimension needed by the reshape
            # step 3: tmp2 = tf.reshape(tmp1, [input_shape[0], int(d), dim, input_shape[2]])  # reshape
            # step 4: tmp3 = tf.transpose(tmp2, [0, 2, 3, 1], name=name)  # restore the original channel order
            # step 5: return tmp3
            return tf.transpose(
                tf.reshape(
                    tf.transpose(input, [0, 3, 1, 2]),
                    [input_shape[0],
                     int(d),
                     tf.cast(tf.cast(input_shape[1], tf.float32) / tf.cast(d, tf.float32) * tf.cast(input_shape[3], tf.float32), tf.int32),
                     input_shape[2]]),
                [0, 2, 3, 1], name=name)
        else:
            # step 1: tmp1 = tf.transpose(input, [0, 3, 1, 2])  # reorder [N, H, W, C] -> [N, C, H, W]
            # step 2: dim = tf.cast(tf.cast(input_shape[1], tf.float32) * (tf.cast(input_shape[3], tf.float32) / tf.cast(d, tf.float32)), tf.int32)
            #         # with the tf.cast calls removed: dim = input_shape[1] * (input_shape[3] / d)
            #         # i.e. the third dimension needed by the reshape
            # step 3: tmp2 = tf.reshape(tmp1, [input_shape[0], int(d), dim, input_shape[2]])  # reshape
            # step 4: tmp3 = tf.transpose(tmp2, [0, 2, 3, 1], name=name)  # restore the original channel order
            # step 5: return tmp3
            return tf.transpose(
                tf.reshape(
                    tf.transpose(input, [0, 3, 1, 2]),
                    [input_shape[0],
                     int(d),
                     tf.cast(tf.cast(input_shape[1], tf.float32) * (tf.cast(input_shape[3], tf.float32) / tf.cast(d, tf.float32)), tf.int32),
                     input_shape[2]]),
                [0, 2, 3, 1],
                name=name)

    @layer
    def feature_extrapolating(self, input, scales_base, num_scale_base, num_per_octave, name):
        '''
        Feature extrapolating layer (custom op pass-through).
        :param input: input tensor
        :param scales_base: base scales
        :param num_scale_base: number of base scales
        :param num_per_octave: number of scales per octave
        :param name: op name
        :return: the extrapolated features
        '''
        # NOTE(review): feature_extrapolating_op is not imported anywhere in
        # this file, so calling this layer would raise NameError — confirm
        # against the upstream repository before use.
        return feature_extrapolating_op.feature_extrapolating(input,
                                                              scales_base,
                                                              num_scale_base,
                                                              num_per_octave,
                                                              name=name)

    @layer
    def lrn(self, input, radius, alpha, beta, name, bias=1.0):
        '''
        Local response normalization layer.
        :param input: input tensor
        :param radius: depth_radius
        :param alpha: alpha
        :param beta: beta
        :param name: op name
        :param bias: bias term
        :return: the normalized tensor
        '''
        return tf.nn.local_response_normalization(input,
                                                  depth_radius=radius,
                                                  alpha=alpha,
                                                  beta=beta,
                                                  bias=bias,
                                                  name=name)

    @layer
    def concat(self, inputs, axis, name):
        '''
        Concatenate several tensors along the given axis.
        :param inputs: the sequence of input tensors
        :param axis: the concatenation axis
        :param name: op name
        :return: the concatenated tensor
        '''
        # Old TF API: the axis is passed as concat_dim.
        return tf.concat(concat_dim=axis, values=inputs, name=name)

    @layer
    def fc(self, input, num_out, name, relu=True, trainable=True):
        '''
        Fully connected layer.
        :param input: input tensor
        :param num_out: output dimension
        :param name: op name
        :param relu: whether to apply relu activation
        :param trainable: whether the weights are trainable
        :return: the fully connected layer output
        '''
        with tf.variable_scope(name) as scope:
            # Only use the first input when a tuple is fed.
            if isinstance(input, tuple):
                input = input[0]
            input_shape = input.get_shape()
            if input_shape.ndims == 4:
                # 4-D input: flatten [N, H, W, C] (as [N, C, H, W]) to [N, dim].
                dim = 1
                for d in input_shape[1:].as_list():
                    dim *= d
                feed_in = tf.reshape(tf.transpose(input, [0, 3, 1, 2]), [-1, dim])
            else:
                feed_in, dim = (input, int(input_shape[-1]))
            # The bbox regression head uses a smaller weight stddev.
            if name == 'bbox_pred':
                init_weights = tf.truncated_normal_initializer(0.0, stddev=0.001)
                init_biases = tf.constant_initializer(0.0)
            else:
                init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01)
                init_biases = tf.constant_initializer(0.0)
            weights = self.make_var('weights', [dim, num_out], init_weights, trainable)
            biases = self.make_var('biases', [num_out], init_biases, trainable)
            # relu_layer fuses xw_plus_b with relu; xw_plus_b is the linear form.
            op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b
            fc = op(feed_in, weights, biases, name=scope.name)
            return fc

    @layer
    def softmax(self, input, name):
        '''
        Softmax layer.
        :param input: input tensor
        :param name: op name
        :return: the softmax output
        '''
        input_shape = tf.shape(input)
        if name == 'rpn_cls_prob':
            # For the RPN class probabilities, flatten to [N*H*W, C], apply
            # softmax over the last axis, then restore the 4-D shape.
            return tf.reshape(tf.nn.softmax(tf.reshape(input, [-1, input_shape[3]])),
                              [-1, input_shape[1], input_shape[2], input_shape[3]], name=name)
        else:
            return tf.nn.softmax(input, name=name)

    @layer
    def dropout(self, input, keep_prob, name):
        '''
        Dropout layer.
        :param input: input tensor
        :param keep_prob: keep probability
        :param name: op name
        :return: the dropout output
        '''
        return tf.nn.dropout(input, keep_prob, name=name)