Faster R-CNN TensorFlow版的源码
传送门:Faster R-CNN网络解读
1.文件目录结构
2.train.py网络训练
训练的主要执行过程和函数调用关系如下图所示,对应源码的部分下文有较详细的注释.
3 Train类
网络训练的核心是Train类,Train类的__init__函数进行了数据加载的全部操作,Train类的函数train()实现了网络构建和迭代训练的操作。
# Driver class for Faster R-CNN training: __init__ loads the dataset,
# train() builds the graph and runs the optimization (body truncated here).
class Train:
def __init__(self):
# Create network
if cfg.FLAGS.network == 'vgg16':
self.net = vgg16(batch_size=cfg.FLAGS.ims_per_batch)
else:
# only the VGG16 backbone is supported by this script
raise NotImplementedError
# Load the dataset (imdb wrapper + region-of-interest database)
self.imdb, self.roidb = combined_roidb("voc_2007_trainval")
self.data_layer = RoIDataLayer(self.roidb, self.imdb.num_classes)
self.output_dir = cfg.get_output_dir(self.imdb, 'default') # where the model checkpoints are saved
def train(self):
# Create session
tfconfig = tf.ConfigProto(allow_soft_placement=True)
tfconfig.gpu_options.allow_growth = True # grab GPU memory on demand rather than all at once
sess = tf.Session(config=tfconfig)
with sess.graph.as_default():
tf.set_random_seed(cfg.FLAGS.rng_seed) # fixed seed for reproducibility
# Build the network architecture
layers = self.net.create_architecture(sess, "TRAIN", self.imdb.num_classes, tag='default')
loss = layers['total_loss']
lr = tf.Variable(cfg.FLAGS.learning_rate, trainable=False) # non-trainable so it can be stepped manually
momentum = cfg.FLAGS.momentum
optimizer = tf.train.MomentumOptimizer(lr, momentum)
gvs = optimizer.compute_gradients(loss)
# ...... (remainder of the training loop omitted in this excerpt)
4 __init__(self)Train类的初始化函数
4.1 combined_roidb(imdb_names)由数据集名称准备加载对应数据
考虑到可能不只有一个数据集,roidbs = [get_roidb(s) for s in imdb_names.split('+')]将用+连接的数据集分别加载进来。
imdb = get_imdb(imdb_name)调用中构造了两个类:数据集类pascal_voc和图片类imdb,见下文。
def combined_roidb(imdb_names):
    """Load one or several '+'-joined datasets and merge their roidbs.

    Returns an (imdb, roidb) pair. When several dataset names are joined
    with '+', their roidb entries are concatenated into a single list and
    a combined imdb wrapper is built for the merged set.
    """
    def _roidb_for(name):
        # Build the dataset wrapper (pascal_voc on top of imdb) for one name.
        dataset = get_imdb(name)
        print('Loaded dataset `{:s}` for training'.format(dataset.name))
        dataset.set_proposal_method("gt")  # use ground-truth boxes as proposals
        print('Set proposal method: {:s}'.format("gt"))
        return get_training_roidb(dataset)  # flipped + prepared training entries

    loaded = [_roidb_for(part) for part in imdb_names.split('+')]
    merged = loaded[0]
    if len(loaded) > 1:
        for extra in loaded[1:]:
            merged.extend(extra)
        # NOTE(review): only the second dataset's classes seed the combined
        # wrapper — this mirrors the original code; verify upstream intent.
        tmp = get_imdb(imdb_names.split('+')[1])
        imdb = imdb2(imdb_names, tmp.classes)
    else:
        imdb = get_imdb(imdb_names)
    return imdb, merged
4.2 factory.py工厂类
利用lambda表达式像工厂一样自定义自己所需的数据库类,返回数据库供网络训练和测试使用
# Register a factory entry 'voc_<year>_<split>' for every PASCAL VOC variant.
# Each entry is a zero-argument lambda; split/year are bound as default
# arguments so every closure captures its own values rather than the final
# loop values (the classic late-binding pitfall).
for year in ('2007', '2012'):
    for split in ('train', 'val', 'trainval', 'test'):
        name = 'voc_{}_{}'.format(year, split)
        __sets[name] = (lambda split=split, year=year: pascal_voc(split, year))
4.3 class pascal_voc(imdb)
pascal_voc是继承自imdb的类,主要针对数据集中生成roidb
# PASCAL VOC dataset wrapper; subclasses the generic imdb base class and
# knows how to locate images/annotations on disk and build the gt roidb.
class pascal_voc(imdb):
def __init__(self, image_set, year, devkit_path=None):
imdb.__init__(self, 'voc_' + year + '_' + image_set) # initialize the imdb base with the dataset name
self._year = year
self._image_set = image_set # e.g. 'train', 'trainval' or 'test'
self._devkit_path = self._get_default_path() if devkit_path is None \
else devkit_path
self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
self._classes = ('__background__', # always index 0
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor')
# map class name -> integer index (0 .. num_classes-1)
self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes)))))
self._image_ext = '.jpg'
self._image_index = self._load_image_set_index()
# Default to roidb handler
self._roidb_handler = self.gt_roidb # ground-truth boxes by default
self._salt = str(uuid.uuid4()) # random salt (presumably to uniquify result files — confirm)
self._comp_id = 'comp4'
# ...... (remainder omitted in this excerpt)
4.4 class imdb(object)数据库读写类
是数据库读写类的基类,封装了许多db的操作
# imdb base-class constructor (excerpt): generic image-database bookkeeping
# shared by concrete datasets such as pascal_voc.
def __init__(self, name, classes=None):
self._name = name # dataset name, e.g. 'voc_2007_trainval'
self._num_classes = 0 # NOTE(review): stays 0 here; presumably num_classes derives from _classes — confirm
if not classes:
self._classes = []
else:
self._classes = classes
self._image_index = [] # list of image identifiers
self._obj_proposer = 'gt'
self._roidb = None # lazily built region-of-interest database
self._roidb_handler = self.default_roidb # callable that builds the roidb on demand
# Use this dict for storing dataset specific config options
self.config = {}
# ...... (remainder omitted in this excerpt)
4.5 get_training_roidb(imdb)获得训练数据
再回到get_roidb(imdb_name)函数中,get_training_roidb(imdb)是在获得训练数据,包括
- 加载数据并翻转(镜像),数据量翻倍
- 对roidb做一些处理(图像大小、路径),方便操作
def get_training_roidb(imdb):
    """Return the roidb (Region of Interest database) used for training.

    Doubles the dataset by appending a horizontally flipped copy of every
    image, then lets the roidb layer attach per-image metadata (image size,
    path, ...) so training code can index entries directly.
    """
    # The original guarded the flipping step with a constant-true flag; it
    # always runs, so the guard is dropped without changing behavior.
    print('Appending horizontally-flipped training examples...')
    imdb.append_flipped_images()
    print('done')
    print('Preparing training data...')
    rdl_roidb.prepare_roidb(imdb)
    print('done')
    return imdb.roidb
4.6 imdb.append_flipped_images()翻转原始图像
翻转即水平镜像:图像左右翻转,每个物体框的x坐标做相应变换。该函数主要进行了:
- 得到当前的数据总量
- 得到图像宽用于坐标变换
- 所有的图都做一次镜像
- 得到原图每个物体的框,boxes = self.roidb[i]['boxes'].copy()详细见下文
- 将翻转后的图添加到加载的原图后
def append_flipped_images(self):
    """Append a horizontally mirrored roidb entry for every image.

    Mirroring only transforms the x-coordinates of each ground-truth box;
    the overlap matrix and class labels are shared with the original entry.
    The image-index list is doubled so flipped entries map back to the same
    image files (the 'flipped' flag tells the loader to mirror at read time).
    """
    total = self.num_images          # number of entries before flipping
    img_widths = self._get_widths()  # per-image widths, needed to mirror x
    for idx in range(total):
        flipped_boxes = self.roidb[idx]['boxes'].copy()
        x1_orig = flipped_boxes[:, 0].copy()
        x2_orig = flipped_boxes[:, 2].copy()
        # mirror: new_x1 = W - old_x2 - 1, new_x2 = W - old_x1 - 1
        flipped_boxes[:, 0] = img_widths[idx] - x2_orig - 1
        flipped_boxes[:, 2] = img_widths[idx] - x1_orig - 1
        assert (flipped_boxes[:, 2] >= flipped_boxes[:, 0]).all()
        self.roidb.append({
            'boxes': flipped_boxes,
            'gt_overlaps': self.roidb[idx]['gt_overlaps'],
            'gt_classes': self.roidb[idx]['gt_classes'],
            'flipped': True,
        })
    self._image_index = self._image_index * 2  # flipped copies reuse the same ids
4.7self.roidb[i]['boxes'].copy()得到原图每个物体的框gt
注释中描述了这个过程
def _load_pascal_annotation(self, index):
"""
Load image and bounding boxes info from XML file in the PASCAL VOC
format.
"""
filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
tree = ET.parse(filename) # xml解析
objs = tree.findall('object')
if not self.config['use_diff']:
# Exclude the samples labeled as difficult
non_diff_objs = [
obj for obj in objs if int(obj.find('difficult').text) == 0]
# if len(non_diff_objs) != len(objs):
# print 'Removed {} difficult objects'.format(
# len(objs) - len(non_diff_objs))
objs = non_diff_objs
num_objs = len(objs) # 读当前文件里有多少物体
boxes = np.zeros((num_objs, 4), dtype=np.uint16) # 按物体数量初始化存放框信息的列表
gt_classes = np.zeros((num_objs), dtype=np.int32) # 初始化给物体进行标注的列表
overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) # 用来标注每个物体对应num_classes的哪个位置(即标注该物体属于哪类)
# "Seg" area for pascal is just the box area
seg_areas = np.zeros((num_objs), dtype=np.float32) # 存放物体的面积
# Load object bounding boxes into a data frame.
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
# Make pixel indexes 0-based
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
cls = self._class_to_ind[obj.find('name').text.lower().strip()] # 把当前物体的类别转换成索引值
boxes[ix, :] = [x1, y1, x2, y2] # 把物体和他的框信息传入
gt_classes[ix] = cls
overlaps[ix, cls] = 1.0 # 类似one_hot编码
seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
overlaps = scipy.sparse.csr_matrix(overlaps)
return {'boxes': boxes,
'gt_classes': gt_classes,
'gt_overlaps': overlaps,
'flipped': False,
'seg_areas': seg_areas}
5.train(self)Train类的函数
建立网络,迭代训练
# Build the graph and start iterative training (excerpt; tail truncated).
def train(self):
# Create session
tfconfig = tf.ConfigProto(allow_soft_placement=True)
tfconfig.gpu_options.allow_growth = True # grab GPU memory on demand rather than all at once
sess = tf.Session(config=tfconfig)
with sess.graph.as_default():
tf.set_random_seed(cfg.FLAGS.rng_seed) # fixed seed for reproducibility
# Build the network architecture
layers = self.net.create_architecture(sess, "TRAIN", self.imdb.num_classes, tag='default')
loss = layers['total_loss']
lr = tf.Variable(cfg.FLAGS.learning_rate, trainable=False) # non-trainable so it can be stepped manually
momentum = cfg.FLAGS.momentum
optimizer = tf.train.MomentumOptimizer(lr, momentum)
gvs = optimizer.compute_gradients(loss)
# ...... (remainder of the training loop omitted in this excerpt)
5.1 create_architecture()构建网络
- 先进行一些初始化,如anchor box大小、比例的初值的个数等
- self.build_network(sess, training)是核心,构建网络
# Build the computation graph for one mode ('TRAIN' or 'TEST'): defines input
# placeholders, anchor configuration and regularizers, then delegates layer
# construction to build_network(). (Excerpt; tail truncated.)
def create_architecture(self, sess, mode, num_classes, tag=None, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
self._image = tf.placeholder(tf.float32, shape=[self._batch_size, None, None, 3])
self._im_info = tf.placeholder(tf.float32, shape=[self._batch_size, 3]) # presumably (height, width, scale) — confirm
self._gt_boxes = tf.placeholder(tf.float32, shape=[None, 5]) # presumably 4 box coords + class label — confirm
self._tag = tag
self._num_classes = num_classes
self._mode = mode
self._anchor_scales = anchor_scales # base sizes of the anchor boxes
self._num_scales = len(anchor_scales) # how many sizes
self._anchor_ratios = anchor_ratios # aspect ratios of the anchor boxes
self._num_ratios = len(anchor_ratios) # how many ratios
self._num_anchors = self._num_scales * self._num_ratios # total anchor types per position
training = mode == 'TRAIN'
testing = mode == 'TEST'
assert tag != None # NOTE(review): 'tag is not None' would be the idiomatic test
# handle most of the regularizer here
weights_regularizer = tf.contrib.layers.l2_regularizer(cfg.FLAGS.weight_decay)
if cfg.FLAGS.bias_decay:
biases_regularizer = weights_regularizer
else:
biases_regularizer = tf.no_regularizer
# list as many types of layers as possible, even if they are not used now
with arg_scope([slim.conv2d, slim.conv2d_in_plane,
slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected],
weights_regularizer=weights_regularizer,
biases_regularizer=biases_regularizer,
biases_initializer=tf.constant_initializer(0.0)):
rois, cls_prob, bbox_pred = self.build_network(sess, training) # core graph construction
# ...... (remainder omitted in this excerpt)
5.2 build_network(self, sess, is_training=True)构建网络
- build head创建卷积层
- build rpn创建rpn层
- build proposals创建过滤层
- build predictions创建全连接输出层
def build_network(self, sess, is_training=True):
    """Assemble the full Faster R-CNN graph: head -> RPN -> proposals -> predictions.

    Returns (rois, cls_prob, bbox_pred) and records every intermediate
    tensor in self._predictions for loss computation and summaries.
    """
    with tf.variable_scope('vgg_16', 'vgg_16'):
        # Weight initializers for the new (non-backbone) layers.
        if cfg.FLAGS.initializer == "truncated":
            initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
            initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)
        else:
            initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
            initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001)
        # Shared convolutional feature extractor.
        net = self.build_head(is_training)
        # Region Proposal Network on top of the shared features.
        rpn_cls_prob, rpn_bbox_pred, rpn_cls_score, rpn_cls_score_reshape = self.build_rpn(net, is_training, initializer)
        # Filter/refine RPN output into the RoIs used by the detection head.
        rois = self.build_proposals(is_training, rpn_cls_prob, rpn_bbox_pred, rpn_cls_score)
        # Per-RoI classification and box regression.
        cls_score, cls_prob, bbox_pred = self.build_predictions(net, rois, is_training, initializer, initializer_bbox)
        self._predictions.update({
            "rpn_cls_score": rpn_cls_score,
            "rpn_cls_score_reshape": rpn_cls_score_reshape,
            "rpn_cls_prob": rpn_cls_prob,
            "rpn_bbox_pred": rpn_bbox_pred,
            "cls_score": cls_score,
            "cls_prob": cls_prob,
            "bbox_pred": bbox_pred,
            "rois": rois,
        })
        self._score_summaries.update(self._predictions)
        return rois, cls_prob, bbox_pred
5.3 build_head()创建卷积层
这里是VGG16网络的卷积部分(13个3x3卷积层分为5组,组间共4次池化),经过该网络的特征图与原图大小是1:16
def build_head(self, is_training):
    """VGG16 convolutional head: 13 3x3 conv layers in five groups with four
    2x2 max-poolings between them, so the output feature map is 1/16 the
    input size. The first two groups are frozen (trainable=False)."""
    # (repeats, output channels, trainable, conv scope, pool scope or None)
    groups = [
        (2, 64, False, 'conv1', 'pool1'),
        (2, 128, False, 'conv2', 'pool2'),
        (3, 256, is_training, 'conv3', 'pool3'),
        (3, 512, is_training, 'conv4', 'pool4'),
        (3, 512, is_training, 'conv5', None),  # no pooling after conv5
    ]
    net = self._image
    for reps, depth, trainable, conv_scope, pool_scope in groups:
        net = slim.repeat(net, reps, slim.conv2d, depth, [3, 3], trainable=trainable, scope=conv_scope)
        if pool_scope is not None:
            net = slim.max_pool2d(net, [2, 2], padding='SAME', scope=pool_scope)
    # Record activations for TensorBoard summaries.
    self._act_summaries.append(net)
    # Expose the head output so later stages can reuse it.
    self._layers['head'] = net
    return net
5.4 build_rpn()创建RPN层
见注释的标注
def build_rpn(self, net, is_training, initializer):
    """Region Proposal Network on top of the shared feature map.

    Per spatial position and per anchor it produces a 2-way fg/bg score and
    a 4-value box regression. Returns (rpn_cls_prob, rpn_bbox_pred,
    rpn_cls_score, rpn_cls_score_reshape).
    """
    # Set up the anchor grid for the current image size.
    self._anchor_component()
    # Shared 3x3 conv feeding both RPN branches.
    rpn = slim.conv2d(net, 512, [3, 3], trainable=is_training,
                      weights_initializer=initializer, scope="rpn_conv/3x3")
    self._act_summaries.append(rpn)
    # Classification branch: 2 channels (foreground / background) per anchor.
    rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1],
                                trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None,
                                scope='rpn_cls_score')
    # Reshape so the softmax runs over exactly 2 channels, then reshape back.
    rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
    rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
    rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")
    # Regression branch: 4 box coordinates per anchor.
    rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1],
                                trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None,
                                scope='rpn_bbox_pred')
    return rpn_cls_prob, rpn_bbox_pred, rpn_cls_score, rpn_cls_score_reshape
5.5 build_proposals()创建过滤层
如何过滤见第一节
def build_proposals(self, is_training, rpn_cls_prob, rpn_bbox_pred, rpn_cls_score):
    """Turn RPN outputs into the final RoIs fed to the detection head."""
    if is_training:
        rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
        # Try to have a deterministic order for the computing graph, for reproducibility
        with tf.control_dependencies([rpn_labels]):
            rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
        return rois
    # Inference: pick the configured proposal-selection strategy.
    mode = cfg.FLAGS.test_mode
    if mode == 'nms':
        rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
    elif mode == 'top':
        rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
    else:
        raise NotImplementedError
    return rois
5.6 build_predictions()创建全连接输出层
def build_predictions(self, net, rois, is_training, initializer, initializer_bbox):
    """Fully connected detection head over RoI-pooled features.

    Returns (cls_score, cls_prob, bbox_pred): raw per-class scores, softmax
    probabilities, and per-class box regression deltas.
    """
    # Crop each RoI out of the feature map and flatten it.
    pool5 = self._crop_pool_layer(net, rois, "pool5")
    flat = slim.flatten(pool5, scope='flatten')
    # Two 4096-wide FC layers, each followed by dropout during training only.
    fc6 = slim.fully_connected(flat, 4096, scope='fc6')
    if is_training:
        fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6')
    fc7 = slim.fully_connected(fc6, 4096, scope='fc7')
    if is_training:
        fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7')
    # Classification scores/probabilities and per-class box regression.
    cls_score = slim.fully_connected(fc7, self._num_classes,
                                     weights_initializer=initializer,
                                     trainable=is_training,
                                     activation_fn=None, scope='cls_score')
    cls_prob = self._softmax_layer(cls_score, "cls_prob")
    box_deltas = slim.fully_connected(fc7, self._num_classes * 4,
                                      weights_initializer=initializer_bbox,
                                      trainable=is_training,
                                      activation_fn=None, scope='bbox_pred')
    return cls_score, cls_prob, box_deltas
最后便是添加损失函数和优化器开始迭代训练