Tensorflow—Faster RCNN网络(faster rcnn的训练)(三)
在https://blog.csdn.net/weixin_42206075/article/details/111984086这篇blog中,已经介绍了第一大部分:rpn_train的代码。通过step1_train_rpn.py,我们训练了RPN网络(每次放入256个样本进行训练),然后根据训练好的RPN网络在每张训练图片中选取2000个ROI区域,后续将这img_num * 2000个左右的ROIs放入faster rcnn的主干分类与回归网络中进行训练,这篇blog就是实现该模块的~
fast_rcnn\config.py
# --- Dataset ---------------------------------------------------------------
# The 20 object categories to detect.
CLASSES = [
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
]
XML_PATH = "./VOCdevkit/VOC2007/Annotations/"  # directory of the xml annotation files
IMG_PATH = "./VOCdevkit/VOC2007/JPEGImages/"  # directory of the image files

# --- Network input / RoI pooling --------------------------------------------
IMG_H = 600  # height of the image fed to the Faster R-CNN network
IMG_W = 800  # width of the image fed to the Faster R-CNN network
POOLED_H = 7  # output height of RoI pooling
POOLED_W = 7  # output width of RoI pooling

# --- Sampling ---------------------------------------------------------------
BATCHSIZE = 1  # number of images per training batch
MINIBATCH = 64  # total number of positive + negative RoIs sampled per image
NUMS_PROPOSAL = 2000  # number of RoIs the RPN selects per image
NMS_THRESHOLD = 0.7  # NMS threshold used when splitting positives during training

# --- Optimisation -----------------------------------------------------------
LEARNING_RATE = 0.001  # initial learning rate
WEIGHT_DECAY = 0.0005  # coefficient of the L2 weight-decay loss term
MOMENTUM = 0.9  # momentum of the momentum optimizer
EPSILON = 1e-10  # smoothing constant
***fast_rcnn\ops.py(xywh2x1y1x2y2函数)***
作用:将(64,4)的xywh格式数据转为按特征图尺寸归一化的(y1, x1, y2, x2)格式数据(即tf.image.crop_and_resize所需的box格式),便于后续roi pooling
def xywh2x1y1x2y2(xywh):
    """Convert centre-format boxes to normalized corner boxes for RoI pooling.

    Args:
        xywh: 2-D tensor whose columns are (center_x, center_y, width, height)
            in input-image pixels.

    Returns:
        Tensor with columns (y1, x1, y2, x2). Each coordinate is divided by 16
        and then by the feature-map size derived from IMG_H / IMG_W.
    """
    cx, cy = xywh[:, 0:1], xywh[:, 1:2]
    box_w, box_h = xywh[:, 2:3], xywh[:, 3:4]

    # Corner coordinates in input-image pixels.
    left, top = cx - box_w / 2, cy - box_h / 2
    right, bottom = cx + box_w / 2, cy + box_h / 2

    # Feature-map size after the factor-16 down-sampling.
    feat_h, feat_w = IMG_H // 16, IMG_W // 16

    normalized = [
        top / 16 / feat_h,
        left / 16 / feat_w,
        bottom / 16 / feat_h,
        right / 16 / feat_w,
    ]
    return tf.concat(normalized, axis=1)
fast_rcnn\vggnet.py
作用:构建VGG16网络,但是这里值得注意的是它与RPN网络中VGG16的区别:在fast_rcnn网络中,VGG16在进行4次2倍下采样之后,后面会紧跟着roi pooling,因为在rpn选取出的roi的框大小是不相同的,所以无法直接进行喂入网络中训练,需要用roi pooling进行大小格式的统一。同时,在该代码中,用两次全卷积代替VGG16中最后两层的全连接(论文中是进行了两次全连接)。
def vgg_16(inputs, boxes, box_idx, scope='vgg_16', is_training=True):
    """Build the VGG-16 backbone followed by the RoI head (fc6 / fc7).

    The five VGG conv stages run on the whole image, RoI pooling then crops
    one fixed-size feature patch per box, and the two former fully connected
    layers are applied as convolutions (7x7 VALID, then 1x1).

    Args:
        inputs: image tensor of size [batch_size, height, width, channels].
        boxes: RoI boxes forwarded unchanged to ``roi_pooling``.
        box_idx: per-box image index forwarded unchanged to ``roi_pooling``.
        scope: optional scope for the variables.
        is_training: whether dropout after fc6 / fc7 is active. Defaults to
            True, which matches the previous hard-coded behaviour; pass False
            when building an inference graph.

    Returns:
        The fc7 features with the two singleton spatial axes squeezed away,
        i.e. one feature row per RoI.
    """
    # Subtract the per-channel mean, then rescale by 255.
    inputs = inputs - tf.constant([123.68, 116.779, 103.939])
    inputs = inputs / 255
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            # conv1 and conv2 are frozen (trainable=False).
            net = layers_lib.repeat(
                inputs, 2, layers.conv2d, 64, [3, 3], scope='conv1', trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool1', padding="SAME")
            net = layers_lib.repeat(net, 2, layers.conv2d, 128, [3, 3], scope='conv2', trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool2', padding="SAME")
            net = layers_lib.repeat(net, 3, layers.conv2d, 256, [3, 3], scope='conv3')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool3', padding="SAME")
            net = layers_lib.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv4')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool4', padding="SAME")
            # Four SAME-padded 2x pools: a (1, 600, 800, 3) input gives (1, 38, 50, 512).
            net = layers_lib.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv5')
            # One fixed-size patch per RoI, e.g. (64, 7, 7, 512).
            net = roi_pooling(net, boxes, box_idx)
            # Use conv2d instead of fully_connected layers.
            net = layers.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')  # (num_rois, 1, 1, 4096)
            net = layers_lib.dropout(
                net, keep_prob=0.5, is_training=is_training, scope='dropout6')
            net = layers.conv2d(net, 4096, [1, 1], scope='fc7')  # (num_rois, 1, 1, 4096)
            net = layers_lib.dropout(
                net, keep_prob=0.5, is_training=is_training, scope='dropout7')
            # Drop the two size-1 spatial axes, which acts like a flatten: (num_rois, 4096).
            net = tf.squeeze(net, axis=[1, 2])
    return net
***fast_rcnn\ops.py(roi_pooling函数)***
作用:这里用tf内置的api tf.image.crop_and_resize来实现roi pooling操作(它按box裁剪后做双线性插值缩放,是对原始RoI max pooling的一种近似),将不同shape的ROI区域进行池化,得到相同尺度的feature map,这里即(64,7,7,512)
def roi_pooling(inputs, boxes, box_idx):
    """Crop every box out of the feature map and resize it to a fixed size.

    Thin wrapper around ``tf.image.crop_and_resize`` using the configured
    output size (POOLED_H, POOLED_W).

    Args:
        inputs: feature map tensor to crop from.
        boxes: boxes passed as the ``boxes`` argument of crop_and_resize.
        box_idx: per-box index selecting which batch image to crop from.

    Returns:
        The resized crops, one per box, e.g. shape (64, 7, 7, 512).
    """
    pooled_size = [POOLED_H, POOLED_W]
    return tf.image.crop_and_resize(inputs, boxes, box_idx, pooled_size)
***fast_rcnn\ops.py(fully_connected函数)***
作用:在进行两次初始化全连接(代码实现中是使用全卷积)之后,分别进行两次全连接,对应于:
—FC 21 用来分类,预测RoIs属于哪个类别(20个类+背景)
—FC 84 用来回归位置(21个类,每个类都有4个位置参数);注意:本文代码实现中回归分支的输出维度是4,即所有类别共用一组位置参数
def fully_connected(name, inputs, nums_out):
    """Apply a dense layer ``inputs @ W + b`` inside variable scope ``name``.

    Args:
        name: variable scope that holds the layer's "W" and "b" variables.
        inputs: 2-D tensor; its last dimension is the input width.
        nums_out: number of output units.

    Returns:
        Tensor with ``nums_out`` columns. No activation is applied.
    """
    in_dim = inputs.shape[-1]
    weight_init = tf.random_normal_initializer(mean=0., stddev=0.01)
    bias_init = tf.constant_initializer([0.])
    with tf.variable_scope(name):
        weights = tf.get_variable("W", [in_dim, nums_out], initializer=weight_init)
        bias = tf.get_variable("b", [nums_out], initializer=bias_init)
        outputs = tf.matmul(inputs, weights) + bias
    return outputs
fast_rcnn\networks.py
作用:返回faster rcnn的两个head:分类与回归,shape分别为(64, 21)和(64, 4)
def network(inputs, boxes, box_idx):
    """Build the detection network and return its two heads.

    Args:
        inputs: image batch forwarded to ``vgg_16``.
        boxes: RoI boxes forwarded to ``vgg_16``.
        box_idx: per-box image index forwarded to ``vgg_16``.

    Returns:
        (cls, reg): class scores with ``len(CLASSES) + 1`` columns (the extra
        column is the background class) and box regression with 4 columns,
        e.g. shapes (64, 21) and (64, 4).
    """
    roi_features = vgg_16(inputs, boxes, box_idx)  # e.g. shape (64, 4096)
    num_outputs = len(CLASSES) + 1
    cls = fully_connected("classification", roi_features, num_outputs)
    reg = fully_connected("regression", roi_features, 4)
    return cls, reg
***fast_rcnn\utils.py(generate_minibatch函数)***
作用:返回用于faster rcnn训练的一个批次的64个正负样本,具体返回数据可见代码最后
def generate_minibatch(proposal, gtbboxes, classes):
    """Sample MINIBATCH positive / negative RoIs of one image for training.

    Positives are the proposals whose best IoU with any ground-truth box is
    >= 0.5, plus the best-matching proposal of every ground-truth box.
    Negatives are the remaining proposals with best IoU in [0.1, 0.5). At most
    MINIBATCH // 4 positives are used; negatives fill the rest. Sampling is
    done with replacement.

    Args:
        proposal: array of RoIs with columns (center_x, center_y, w, h).
        gtbboxes: ground-truth boxes in the format ``cal_ious`` and
            ``bbox2offset`` expect (presumably also xywh -- confirm).
        classes: class id per ground-truth box, indexable by box index.

    Returns:
        mini_batch: the MINIBATCH sampled RoIs in xywh form, positives first.
        mask: 1 for positive rows, 0 for negative rows.
        init_target_bbox: regression offsets for the positives; rows of
            negatives stay zero.
        init_target_classes: class id per row; negatives get len(CLASSES),
            the background class.

    Raises:
        ValueError: if no proposal is available to be used as a negative.
    """
    x1 = proposal[:, 0] - proposal[:, 2] / 2  # left edge
    x2 = proposal[:, 0] + proposal[:, 2] / 2  # right edge
    y1 = proposal[:, 1] - proposal[:, 3] / 2  # top edge
    y2 = proposal[:, 1] + proposal[:, 3] / 2  # bottom edge
    # Clip RoIs that cross the image border.
    x1[x1 < 0.] = 0
    x2[x2 >= IMG_W] = IMG_W - 1
    y1[y1 < 0.] = 0
    y2[y2 >= IMG_H] = IMG_H - 1
    proposal = np.stack(((x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1), axis=1)

    ious = cal_ious(proposal, gtbboxes)  # rows: ground-truth boxes, columns: proposals
    # Proposal(s) attaining the maximum IoU of each ground-truth box.
    max_iou_idx = np.where(np.abs(ious - np.max(ious, axis=1, keepdims=True)) < 1e-3)[1]
    ious = np.max(ious, axis=0)  # best IoU of each proposal over all ground-truth boxes

    # Note: the RPN stage uses 0.7 as its positive threshold; here it is 0.5.
    pos_idx = np.union1d(max_iou_idx, np.where(ious >= 0.5)[0])
    # Never use a ground-truth box's best match as a negative, even at low IoU.
    neg_idx = np.setdiff1d(np.where((ious >= 0.1) & (ious < 0.5))[0], max_iou_idx)
    if neg_idx.shape[0] == 0:
        # No proposal in [0.1, 0.5): widen the pool to every non-positive
        # proposal instead of failing inside np.random.randint.
        neg_idx = np.setdiff1d(np.where(ious < 0.5)[0], max_iou_idx)
    if neg_idx.shape[0] == 0:
        raise ValueError("generate_minibatch: no negative proposals available to sample from")

    pos_nums = pos_idx.shape[0]
    pos_quota = MINIBATCH // 4
    if pos_nums < pos_quota:
        # Too few positives: keep them all.
        chosen_pos = pos_idx
    else:
        # Enough positives: draw a quarter of the minibatch from them.
        chosen_pos = pos_idx[np.random.randint(0, pos_nums, [pos_quota])]
    used_pos = chosen_pos.shape[0]
    # Negatives fill the rest, so the total is always exactly MINIBATCH.
    used_neg = MINIBATCH - used_pos
    chosen_neg = neg_idx[np.random.randint(0, neg_idx.shape[0], [used_neg])]

    mini_batch_pos = proposal[chosen_pos]
    mini_batch_neg = proposal[chosen_neg]
    mini_batch = np.concatenate((mini_batch_pos, mini_batch_neg), axis=0)
    mask = np.concatenate((np.ones([used_pos]), np.zeros([used_neg])), axis=0)

    # Match every positive with the ground-truth box of highest IoU.
    pos_iou = cal_ious(mini_batch_pos, gtbboxes)
    pos_gt_idx = np.argmax(pos_iou, axis=0)
    pos_gt_bbox = gtbboxes[pos_gt_idx]
    pos_classes = classes[pos_gt_idx]
    target_bbox = bbox2offset(mini_batch_pos, pos_gt_bbox)  # regression offsets

    init_target_bbox = np.zeros([MINIBATCH, 4])
    init_target_classes = np.ones([MINIBATCH]) * len(CLASSES)  # default: background
    init_target_classes[:pos_classes.shape[0]] = pos_classes
    init_target_bbox[:target_bbox.shape[0]] = target_bbox
    return mini_batch, mask, init_target_bbox, init_target_classes
***fast_rcnn\utils.py(read_batch函数)***
作用:读取批次数据,具体返回:
# batch_imgs:一个批次输入的图片,shape =(1, 600, 800, 3)
# batch_proposal:在一个图片中选取的64个正负样本,shape=(64, 4)
# target_bboxes:位置进行偏移量转化之后的,负样本全部为0,shape=(64, 4)
# target_bboxes_idx:每个样本所属图片在批次中的下标(roi pooling时用来指定从哪张图片的特征图上裁剪),shape=(64,)
# target_classes:一个批次64个样本相对应的类别
# masks:一个批次64个样本相对应的标签,即为前景还是背景
def read_batch(proposals):
    """Assemble one training batch from randomly chosen annotated images.

    Args:
        proposals: mapping from annotation file name to that image's RoIs.

    Returns:
        batch_imgs: the input images, shape (BATCHSIZE, IMG_H, IMG_W, 3).
        batch_proposal: the sampled RoIs, shape (BATCHSIZE * MINIBATCH, 4).
        target_bboxes: regression offsets; rows of negatives are zero.
        target_bboxes_idx: index within the batch of the image each RoI
            belongs to.
        target_classes: class id of every sampled RoI.
        masks: per-RoI foreground (1) / background (0) flag.
    """
    xml_names = os.listdir(XML_PATH)
    picks = np.random.randint(0, len(xml_names), [BATCHSIZE])

    total_rois = BATCHSIZE * MINIBATCH
    batch_imgs = np.zeros([BATCHSIZE, IMG_H, IMG_W, 3])
    batch_proposal = np.zeros([total_rois, 4])
    masks = np.zeros([total_rois])
    target_bboxes = np.zeros([total_rois, 4])
    target_bboxes_idx = np.zeros([total_rois])
    target_classes = np.zeros([total_rois])

    for i, pick in enumerate(picks):
        filename = xml_names[pick]
        # The image shares the annotation's base name, with a .jpg extension.
        img_file = IMG_PATH + filename[:-4] + ".jpg"
        img, gtbboxes, class_labels = read_data(XML_PATH + filename, img_file)
        img, gtbboxes = resize_img_bbox(img, gtbboxes)  # image becomes (IMG_H, IMG_W, 3)
        mini_batch, mask, target_bbox, target_class = generate_minibatch(
            proposals[filename], gtbboxes, class_labels)

        # Rows of the flat per-RoI arrays that belong to image i.
        rows = slice(i * MINIBATCH, (i + 1) * MINIBATCH)
        batch_proposal[rows] = mini_batch
        masks[rows] = mask
        target_bboxes[rows] = target_bbox
        target_classes[rows] = target_class
        target_bboxes_idx[rows] = i
        batch_imgs[i] = img

    return batch_imgs, batch_proposal, target_bboxes, target_bboxes_idx, target_classes, masks
step2_train_fast_rcnn.py
作用:主文件
import tensorflow as tf
import scipy.io as sio
from fast_rcnn.networks import network
from fast_rcnn.ops import smooth_l1, xywh2x1y1x2y2
from fast_rcnn.utils import read_batch
from fast_rcnn.config import IMG_H, IMG_W, BATCHSIZE, MINIBATCH, EPSILON, WEIGHT_DECAY, LEARNING_RATE, MOMENTUM, CLASSES
# Per-image RoI proposals; read_batch looks them up by annotation file name.
# NOTE(review): presumably written by the RPN stage (step 1) -- confirm the producer and key format.
proposals = sio.loadmat("./proposal.mat")
def train(max_iters=2):
    """Train the Fast R-CNN classification / regression heads.

    Builds the graph, restores the pretrained VGG-16 weights, then runs
    ``max_iters`` momentum-SGD steps on batches from ``read_batch``. A
    checkpoint is written every 1000 iterations.

    Args:
        max_iters: number of training iterations. The default of 2 keeps the
            script's original behaviour.
            NOTE(review): 2 looks like a debugging leftover, since the
            learning-rate drop only triggers after iteration 30000. Pass a
            realistic value for an actual training run.
    """
    num_rois = BATCHSIZE * MINIBATCH
    imgs = tf.placeholder(tf.float32, [BATCHSIZE, IMG_H, IMG_W, 3])  # input images
    batch_proposal = tf.placeholder(tf.float32, [num_rois, 4])  # sampled RoIs in xywh format
    target_bboxes = tf.placeholder(tf.float32, [num_rois, 4])  # regression targets
    target_bboxes_idx = tf.placeholder(tf.int32, [num_rois])  # image index per RoI, for RoI pooling
    target_classes = tf.placeholder(tf.int32, [num_rois])  # class id per RoI
    masks = tf.placeholder(tf.float32, [num_rois])  # 1 = foreground, 0 = background
    learning_rate = tf.placeholder(tf.float32)

    batch_proposal_ = xywh2x1y1x2y2(batch_proposal)  # box format needed by RoI pooling
    cls, reg = network(imgs, batch_proposal_, target_bboxes_idx)
    one_hot = tf.one_hot(target_classes, len(CLASSES) + 1)
    pos_nums = tf.reduce_sum(tf.cast(masks, dtype=tf.float32))
    # Cross-entropy summed over all RoIs, normalised by the number of positives.
    loss_cls = tf.reduce_sum(-tf.log(tf.reduce_sum(tf.nn.softmax(cls) * one_hot, axis=-1) + EPSILON)) / pos_nums
    # Smooth-L1 regression loss; the mask restricts it to positive RoIs.
    loss_reg = tf.reduce_sum(tf.reduce_sum(smooth_l1(reg, target_bboxes), axis=-1) * masks) / pos_nums
    regular = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])
    total_loss = loss_cls + loss_reg + regular * WEIGHT_DECAY
    with tf.variable_scope("Opt"):
        Opt = tf.train.MomentumOptimizer(learning_rate, momentum=MOMENTUM).minimize(total_loss)

    # One saver restores only the "vgg_16" variables from the pretrained
    # checkpoint; the other saves the whole model.
    vgg_saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="vgg_16"))
    saver = tf.train.Saver()

    # The context manager closes the session even if a step raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vgg_saver.restore(sess, "./pretrained_VGG/vgg_16.ckpt")
        for i in range(max_iters):
            # Single 10x step decay after iteration 30000. The old
            # `LR = LR / 10` divided again on every later iteration.
            LR = LEARNING_RATE / 10 if i > 30000 else LEARNING_RATE
            BATCH_IMGS, BATCH_PROPOSAL, TARGET_BBOXES, TARGET_BBOXES_IDX, TARGET_CLASSES, MASKS = read_batch(proposals)
            sess.run(Opt, feed_dict={imgs: BATCH_IMGS, batch_proposal: BATCH_PROPOSAL, masks: MASKS,
                                     target_bboxes: TARGET_BBOXES, target_bboxes_idx: TARGET_BBOXES_IDX,
                                     target_classes: TARGET_CLASSES, learning_rate: LR})
            # if i % 100 == 0:
            #     [LOSS_CLS, LOSS_REG, TOTAL_LOSS] = sess.run([loss_cls, loss_reg, total_loss], feed_dict={imgs: BATCH_IMGS, batch_proposal: BATCH_PROPOSAL, masks: MASKS,
            #                          target_bboxes: TARGET_BBOXES, target_bboxes_idx: TARGET_BBOXES_IDX,target_classes: TARGET_CLASSES, learning_rate: LR})
            #     print("Iteration: %d, total_loss: %f, loss_cls: %f, loss_reg: %f" % (i, TOTAL_LOSS, LOSS_CLS, LOSS_REG))
            if i % 1000 == 0:
                saver.save(sess, "./fast_rcnn/model/model.ckpt")


if __name__ == "__main__":
    train()