- YOLOv1理论篇
- YOLOv1实践篇
工程框架:
在YOLOv1理论篇中我们简要介绍了YOLO的基本原理,本篇从代码角度进一步给出解析。工程结构如下:
config作为参数文件用于保存训练参数、测试参数、模型参数、路径参数等信息;
dataset/tfrecord用于xml数据和标签的解析以及TF格式文件的制作;
network用于网络模型的搭建;
loss_utils用于损失函数相关计算;
process_utils用于后处理及可视化方面的操作;
train/predict分别是训练代码和预测代码;
数据相关:
本次采用的是VOC2012数据集,其文件结构如下:
└── VOCdevkit #根目录
└── VOC2012 #不同年份的数据集,这里用的2012的
├── Annotations #存放xml文件,与JPEGImages中的图片一一对应,解释图片的内容等等
├── ImageSets #该目录下存放的都是txt文件,txt文件中每一行包含一个图片的名称,末尾会加上±1表示正负样本
│ ├── Action
│ ├── Layout
│ ├── Main
│ └── Segmentation
├── JPEGImages #存放源图片
├── SegmentationClass #存放的是图片,语义分割相关
└── SegmentationObject #存放的是图片,实例分割相关
目标检测任务中我们只涉及JPEGImages、Annotations、ImageSets三个文件夹:
- JPEGImages文件夹存放的是数据集的原图片;
- Annotations文件夹存放的是xml文件,主要介绍了对应图片的基本信息,如来自哪个文件夹、文件名、来源、图像尺寸以及图像中包含哪些目标以及目标的信息等;
- ImageSets包括action layout main segmentation四个部分的内容,其中Action下存放的是人的动作; Layout下存放的是具有人体部位的数据; Segmentation下存放的是可用于分割的数据;我们这里只用到Main下存放的trainval.txt文件,记录了用于训练验证的数据名称;
工程代码实现如下:
我们在dataset.py文件中根据trainval.txt读取文件名并加载图片,缩放到网络输入指定尺寸448,并做归一化处理;
def load_image(self, image_num):
    """
    Load one sample image, resize it to the network input size and normalize it.

    Side effect: the vertical / horizontal resize ratios of this image are
    stored in ``self.h_ratio`` / ``self.w_ratio``.

    :param image_num: image id (file name without the ``.jpg`` extension)
    :return: float32 RGB image scaled to [-1, 1]; when ``self.flipped`` is
        truthy a tuple ``(image, horizontally_flipped_image)`` is returned
    :raises IOError: if the image file is missing or cannot be decoded
    """
    image_path = os.path.join(self.data_path, 'JPEGImages', image_num + '.jpg')
    image = cv2.imread(image_path)
    # cv2.imread does not raise on failure, it returns None; fail with a clear
    # message instead of an AttributeError on `image.shape` below.
    if image is None:
        raise IOError('Cannot read image: ' + image_path)

    # image.shape is (height, width, channels)
    self.h_ratio = 1.0 * self.image_size / image.shape[0]
    self.w_ratio = 1.0 * self.image_size / image.shape[1]

    image = cv2.resize(image, (self.image_size, self.image_size), interpolation=cv2.INTER_LINEAR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
    # map pixel values from [0, 255] to [-1, 1]
    image = image / 255.0 * 2 - 1

    # Truthiness check, consistent with load_annotation's `if self.flipped:`
    if self.flipped:
        return image, image[:, ::-1, :]
    return image
根据trainval中读取的文件名解析xml形式的标签,将标签解析成[cell_size, cell_size, 5 + 20]的形式(cell是最终特征图尺寸,5表示x,y,w,h,conf,20表示VOC2012的类别个数):
def load_annotation(self, image_num):
    """
    Parse one VOC xml annotation into a label tensor.

    The label has shape [cell_size, cell_size, 5 + class_num]. For the cell that
    contains an object's center: channel 0 is the confidence (1), channels 1:5
    are (center_x, center_y, width, height) in pixels of the resized image, and
    channels 5: are the one-hot class. Only the first object falling into a cell
    is recorded.

    Box coordinates are scaled with ``self.w_ratio`` / ``self.h_ratio``, which
    this method only reads.

    :param image_num: image id (file name without the ``.xml`` extension)
    :return: the label array; when ``self.flipped`` is truthy a tuple
        ``(label, label_flipped)`` where ``label_flipped`` describes the
        horizontally mirrored image
    """
    label_shape = [self.cell_size, self.cell_size, 5 + self.class_num]
    label = np.zeros(label_shape, np.float32)
    label_path = os.path.join(self.data_path, 'Annotations', image_num + '.xml')
    label_flipped = np.zeros(label_shape, np.float32) if self.flipped else None

    root = ET.parse(label_path).getroot()
    max_coord = self.image_size - 1

    # `obj` instead of `object`: do not shadow the builtin
    for obj in root.findall('object'):
        bndbox = obj.find('bndbox')
        # Scale the annotated corners to the resized image and clamp them
        # into [0, image_size - 1]
        x1 = max(min(float(bndbox.find('xmin').text) * self.w_ratio, max_coord), 0)
        y1 = max(min(float(bndbox.find('ymin').text) * self.h_ratio, max_coord), 0)
        x2 = max(min(float(bndbox.find('xmax').text) * self.w_ratio, max_coord), 0)
        y2 = max(min(float(bndbox.find('ymax').text) * self.h_ratio, max_coord), 0)

        # class name -> integer class index
        class_index = self.class_ind[obj.find('name').text.lower().strip()]

        center_x = (x1 + x2) / 2.0
        center_y = (y1 + y2) / 2.0
        width = x2 - x1
        height = y2 - y1

        # 0-based index of the grid cell containing the box center
        center_x_index = int(center_x / self.image_size * self.cell_size)
        center_y_index = int(center_y / self.image_size * self.cell_size)

        # A cell keeps only the first object assigned to it
        if label[center_y_index, center_x_index, 0] == 1:
            continue

        # Arrays are indexed [row (y), column (x)], hence the swapped order
        label[center_y_index, center_x_index, 0] = 1
        label[center_y_index, center_x_index, 1:5] = [center_x, center_y, width, height]
        label[center_y_index, center_x_index, 5 + class_index] = 1

        if self.flipped:
            # Written at the un-mirrored cell; the grid's x axis is reversed on return
            label_flipped[center_y_index, center_x_index, 0] = 1
            label_flipped[center_y_index, center_x_index, 1:5] = [
                self.image_size - 1 - center_x, center_y, width, height]
            label_flipped[center_y_index, center_x_index, 5 + class_index] = 1

    if self.flipped:
        return label, label_flipped[:, ::-1, :]
    return label
在tfrecord.py文件中采用Tensorflow的tfrecord格式制作数据:
def create_tfrecord(self):
    """
    Write every trainval sample (image + label) into one TFRecord file.

    Nothing is done if the target TFRecord file already exists. When
    ``self.flipped`` is truthy the dataset loaders return (original, flipped)
    pairs and both are written as separate examples.
    """
    trainval_path = os.path.join(self.data_path, 'ImageSets', 'Main', 'trainval.txt')
    tf_file = os.path.join(self.tfrecord_dir, self.train_tfrecord_name)
    if os.path.exists(tf_file):
        return

    # Read the id list BEFORE creating the output file, so a missing list does
    # not leave an empty TFRecord behind that would block regeneration.
    # strip() instead of line[0:-1]: the last line may lack a trailing newline,
    # and blank lines are skipped.
    with open(trainval_path, 'r') as read:
        image_nums = [line.strip() for line in read if line.strip()]

    writer = tf.python_io.TFRecordWriter(tf_file)
    try:
        for image_num in image_nums:
            images = self.dataset.load_image(image_num=image_num)
            labels = self.dataset.load_annotation(image_num=image_num)
            if not self.flipped:
                # The loaders return single arrays in this mode
                images, labels = (images,), (labels,)

            for image, label in zip(images, labels):
                # tobytes() replaces the deprecated ndarray.tostring()
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image': tf.train.Feature(
                            bytes_list=tf.train.BytesList(value=[image.tobytes()])),
                        'label': tf.train.Feature(
                            bytes_list=tf.train.BytesList(value=[label.tobytes()])),
                    }))
                writer.write(example.SerializeToString())
    finally:
        # Always flush/close the file, even if a sample fails to load
        writer.close()
    print('Finish trainval.tfrecord Done')
Tensorflow采用文件队列的方式多线程加载数据:
def parse_single_example(self, file_name):
    """
    Build the graph ops that read and parse one example from the TFRecord file.

    NOTE: ``file_name`` is accepted but not used here; the file path is always
    built from ``self.tfrecord_dir`` and ``self.train_tfrecord_name``.

    :param file_name: name of the tfrecord file to parse (currently ignored)
    :return: (image, label) string tensors holding the raw serialized bytes
    """
    record_path = os.path.join(self.tfrecord_dir, self.train_tfrecord_name)

    # Reader op, fed by a queue of file names
    record_reader = tf.TFRecordReader()
    name_queue = tf.train.string_input_producer([record_path])
    _, serialized = record_reader.read(name_queue)

    # Both features are stored as a single byte string
    feature_spec = {
        'image': tf.FixedLenFeature([], tf.string),
        'label': tf.FixedLenFeature([], tf.string),
    }
    parsed = tf.parse_single_example(serialized, features=feature_spec)
    return parsed['image'], parsed['label']
def parse_batch_examples(self, file_name):
    """
    Build the ops producing one shuffled, decoded batch of training samples.

    :param file_name: name of the tfrecord file, forwarded to parse_single_example
    :return: (image_batch, label_batch) float32 tensors shaped
        [batch_size, image_size, image_size, 3] and
        [batch_size, cell_size, cell_size, 5 + class_num]
    """
    min_after_dequeue = 100
    single_image, single_label = self.parse_single_example(file_name)

    # Shuffle queue filled by 8 reader threads
    raw_images, raw_labels = tf.train.shuffle_batch(
        [single_image, single_label],
        batch_size=self.batch_size,
        num_threads=8,
        capacity=min_after_dequeue + 3 * self.batch_size,
        min_after_dequeue=min_after_dequeue)

    # Raw bytes -> float32 values
    decoded_images = tf.decode_raw(raw_images, tf.float32)
    decoded_labels = tf.decode_raw(raw_labels, tf.float32)

    # Restore the shapes expected by the network and the loss
    image_shape = [self.batch_size, self.image_size, self.image_size, 3]
    label_shape = [self.batch_size, self.cell_size, self.cell_size, 5 + self.class_num]
    image_batch = tf.reshape(decoded_images, image_shape)
    label_batch = tf.reshape(decoded_labels, label_shape)
    return image_batch, label_batch
网络模型:
YOLOv1的网络结构较清晰,特征提取部分类似于VGG的卷积块堆叠,输入尺寸448*448的Tensor经过6次下采样得到7*7大小的特征图,最后接三层全连接输出一维向量。代码如下:
def _build_network(self, inputs, scope='yolo_v1'):
    """
    Define the forward pass of the YOLOv1 network.

    :param inputs: batch of input images
    :param scope: name scope wrapping the network ops
    :return: output of the last fully connected layer (``self.output_size`` units,
        no activation)
    """
    def conv(x, depth, kernel, index, stride=1, padding='SAME'):
        # slim.conv2d with the layer's running index as scope name
        return slim.conv2d(x, depth, kernel, stride, padding=padding, scope='conv_%d' % index)

    def pool(x, index):
        # 2x2 max pooling, halves the spatial size
        return slim.max_pool2d(x, 2, padding='SAME', scope='pool_%d' % index)

    def pad(x, amount, index):
        # Explicit symmetric zero padding on the two spatial axes
        paddings = np.array([[0, 0], [amount, amount], [amount, amount], [0, 0]])
        return tf.pad(x, paddings, name='pad_%d' % index)

    with tf.name_scope(scope):
        with slim.arg_scope([slim.conv2d, slim.fully_connected],
                            activation_fn=self._leaky_relu(self.leaky_alpha),
                            weights_regularizer=slim.l2_regularizer(0.0005),
                            weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)):
            net = pad(inputs, 3, 1)
            net = conv(net, 64, 7, 2, stride=2, padding='VALID')
            net = pool(net, 3)
            # 112x112x64

            net = conv(net, 192, 3, 4)
            net = pool(net, 5)
            # 56x56x192

            net = conv(net, 128, 1, 6)
            net = conv(net, 256, 3, 7)
            net = conv(net, 256, 1, 8)
            net = conv(net, 512, 3, 9)
            net = pool(net, 10)
            # 28x28x512

            # (1x1x256, 3x3x512) pair repeated 4 times: conv_11 .. conv_18
            for index in (11, 13, 15, 17):
                net = conv(net, 256, 1, index)
                net = conv(net, 512, 3, index + 1)
            net = conv(net, 512, 1, 19)
            net = conv(net, 1024, 3, 20)
            net = pool(net, 21)
            # 14x14x1024

            # (1x1x512, 3x3x1024) pair repeated 2 times: conv_22 .. conv_25
            for index in (22, 24):
                net = conv(net, 512, 1, index)
                net = conv(net, 1024, 3, index + 1)
            net = conv(net, 1024, 3, 26)
            net = pad(net, 1, 27)
            net = conv(net, 1024, 3, 28, stride=2, padding='VALID')
            # 7x7x1024

            net = conv(net, 1024, 3, 29)
            net = conv(net, 1024, 3, 30)
            # 7x7x1024

            # Flatten the feature map into one vector per sample
            net = slim.flatten(net, scope='flat_31')
            net = slim.fully_connected(net, 512, scope='fc_32')
            net = slim.fully_connected(net, 4096, scope='fc_33')
            # Dropout against over-fitting
            net = slim.dropout(net, keep_prob=self.keep_prob, is_training=self.is_train, scope='dropout_34')
            # Final layer: cell_size*cell_size*(5*box_per_cell+class_num) raw outputs
            logits = slim.fully_connected(net, self.output_size, activation_fn=None, scope='fc_35')
    return logits
损失函数:
由于在上面数据处理时,已经将标签编码成[cell_size, cell_size, 5 + 20]的形式,这里将网络输出的结果也编码成网格形式,并且对预测结果和标签分别提取出类别,置信度,坐标信息:
提取predicts信息:
- predicts_boxes 信息(7*7*2*4)
- predicts_classes 信息(7*7*20)
- predicts_scales 信息(7*7*2)
提取labels信息:
- labels_response 信息(7*7*1)
- labels_boxes 信息(7*7*2*4)
- labels_classes 信息(7*7*20)
def loss_layer(self, predicts, labels, scope='loss'):
    """
    Split predictions and labels into class / confidence / box parts, compute
    the four YOLOv1 loss terms and register them with ``tf.losses``.

    :param predicts: flat network output, one row per sample
    :param labels: label tensor [batch, cell_size, cell_size, 5 + num_class]
    :param scope: variable scope wrapping the loss ops
    """
    grid = [self.batch_size, self.cell_size, self.cell_size]

    # Predicted boxes: x, y relative to the cell; sqrt(w), sqrt(h) relative to the image
    with tf.name_scope('Predicts_Tensor'):
        # class scores -> [batch, S, S, num_class]
        flat_classes = predicts[:, :self.boundary1]
        predicts_classes = tf.reshape(flat_classes, grid + [self.num_class])
        # box confidences -> [batch, S, S, B]
        flat_scales = predicts[:, self.boundary1:self.boundary2]
        predicts_scales = tf.reshape(flat_scales, grid + [self.boxes_per_cell])
        # box coordinates -> [batch, S, S, B, 4]
        flat_boxes = predicts[:, self.boundary2:]
        predicts_boxes = tf.reshape(flat_boxes, grid + [self.boxes_per_cell, 4])

    # Label boxes: x, y, w, h normalised by the image size
    with tf.name_scope('Labels_Tensor'):
        # 1 where the cell is responsible for an object -> [batch, S, S, 1]
        labels_response = tf.reshape(labels[..., 0], grid + [1])
        # one label box per cell, repeated for every predictor -> [batch, S, S, B, 4]
        single_boxes = tf.reshape(labels[..., 1:5], grid + [1, 4])
        labels_boxes = tf.tile(single_boxes, [1, 1, 1, self.boxes_per_cell, 1]) / self.image_size
        # one-hot classes -> [batch, S, S, num_class]
        labels_classes = labels[..., 5:]

    with tf.variable_scope(scope):
        # classification term
        class_loss = self.class_loss(predicts_classes, labels_classes, labels_response)
        # predictions converted to image-relative x, y, w, h
        global_predict_boxes = self.predicts_to_labels_coord(predicts_boxes)
        # IoU of every predictor with the label box -> [batch, S, S, B]
        iou = self.calc_iou(global_predict_boxes, labels_boxes)
        # responsible / non-responsible predictor masks
        object_mask, noobject_mask = self.calc_mask(iou, labels_response)
        # confidence terms
        object_loss, noobject_loss = self.confidence_loss(predicts_scales, iou, object_mask, noobject_mask)
        # localisation term
        boxes_loss = self.coord_loss(predicts_boxes, labels_boxes, object_mask)

        named_losses = (
            ('class_loss', class_loss),
            ('object_loss', object_loss),
            ('noobject_loss', noobject_loss),
            ('boxes_loss', boxes_loss),
        )
        for _, term in named_losses:
            tf.losses.add_loss(term)
        for tag, term in named_losses:
            tf.summary.scalar(tag, term)
        tf.summary.histogram('iou', iou)
这里的predicts_to_labels_coord函数基于cell的x, y和基于全图的sqrt(w), sqrt(h)转换成基于全图的x, y, w, h。然后分别计算预测和真值之间的定位损失,置信度损失,分类损失,YOLOv1中的损失基本都采用MSE计算,根据之前文章中提到,用平方差的方式做位置回归和分类都不是特别好,所以在后续的版本中将位置回归更换成iou及其变体,类别信息更换成交叉熵计算损失:
- 对于坐标损失只计算含目标的单元格;
- 对于置信度需要计算含目标和不含目标的两类损失值;
- 对于分类损失只计算含目标的单元格;
def class_loss(self, predicts_class, labels_class, labels_response):
    """
    Classification loss: squared error on the class scores, counted only for
    cells that contain an object.

    :param predicts_class: predicted classes [batch, 7, 7, 20]
    :param labels_class: label classes [batch, 7, 7, 20]
    :param labels_response: 1 where the cell holds an object [batch, 7, 7, 1]
    :return: scalar loss, weighted by ``self.class_scale``
    """
    with tf.name_scope('class_loss'):
        # zero out the error of cells without an object
        masked_error = labels_response * (predicts_class - labels_class)
        # sum over the grid and classes, then average over the batch
        per_sample = tf.reduce_sum(tf.square(masked_error), axis=[1, 2, 3])
        class_loss = self.class_scale * tf.reduce_mean(per_sample, name='class_loss')
    return class_loss
def confidence_loss(self, predicts_scale, iou, object_mask, noobject_mask):
    """
    Confidence loss, split into the responsible and non-responsible predictors.

    :param predicts_scale: predicted confidences [batch, 7, 7, 2]
    :param iou: IoU of each predictor with the label box [batch, 7, 7, 2]
    :param object_mask: 1 for predictors responsible for an object [batch, 7, 7, 2]
    :param noobject_mask: 1 for all other predictors [batch, 7, 7, 2]
    :return: (object_confidence_loss, noobject_confidence_loss) scalars
    """
    def weighted_batch_mean(delta, weight):
        # squared error summed per sample, averaged over the batch, then weighted
        per_sample = tf.reduce_sum(tf.square(delta), axis=[1, 2, 3])
        return weight * tf.reduce_mean(per_sample)

    with tf.name_scope('confidence_loss'):
        with tf.name_scope('object_confidence_loss'):
            # target confidence of a responsible predictor is its IoU
            object_delta = object_mask * (predicts_scale - iou)
            object_confidence_loss = weighted_batch_mean(object_delta, self.object_scale)
        with tf.name_scope('noobject_confidence_loss'):
            # target confidence of every other predictor is 0
            noobject_delta = noobject_mask * (predicts_scale - 0)
            noobject_confidence_loss = weighted_batch_mean(noobject_delta, self.noobject_scale)
    return object_confidence_loss, noobject_confidence_loss
def coord_loss(self, predicts_boxes, labels_boxes, object_mask):
    """
    Localisation loss, counted only for the responsible predictors.

    :param predicts_boxes: predicted boxes, cell-relative x, y and image-relative
        sqrt(w), sqrt(h) [batch, 7, 7, 2, 4]
    :param labels_boxes: label boxes, image-relative x, y, w, h [batch, 7, 7, 2, 4]
    :param object_mask: 1 for responsible predictors [batch, 7, 7, 2]
    :return: scalar loss, weighted by ``self.coord_scale``
    """
    with tf.name_scope('coord_loss'):
        # add a trailing axis so the mask broadcasts over the 4 coordinates
        coord_mask = tf.expand_dims(object_mask, axis=-1)
        # bring the labels into the same parameterisation as the predictions
        target_boxes = self.labels_to_predicts_coord(labels_boxes)
        coord_delta = coord_mask * (predicts_boxes - target_boxes)
        per_sample = tf.reduce_sum(tf.square(coord_delta), axis=[1, 2, 3, 4])
        boxes_loss = self.coord_scale * tf.reduce_mean(per_sample)

        histogram_tags = ('boxes_delta_x', 'boxes_delta_y', 'boxes_delta_w', 'boxes_delta_h')
        for component, tag in enumerate(histogram_tags):
            tf.summary.histogram(tag, coord_delta[..., component])
    return boxes_loss
def labels_to_predicts_coord(self, labels_boxes):
    """
    Convert label boxes to the parameterisation used by the predictions.

    x, y become offsets relative to the top-left corner of their cell (in cell
    units); w, h become their square roots.

    The grid size and the number of boxes per cell are taken from
    ``self.cell_size`` / ``self.boxes_per_cell`` instead of the literals 7 and 2,
    so the conversion follows the configured grid.

    :param labels_boxes: image-relative x, y, w, h
        [batch, cell_size, cell_size, boxes_per_cell, 4]
    :return: cell-relative x, y and sqrt(w), sqrt(h), same shape
    """
    cell_size = self.cell_size
    # column_index[b, row, col, box] == col
    column_index = tf.tile(tf.expand_dims(tf.range(cell_size), axis=0), multiples=[cell_size, 1])
    column_index = tf.tile(
        tf.reshape(column_index, shape=[1, cell_size, cell_size, 1]),
        multiples=[self.batch_size, 1, 1, self.boxes_per_cell])
    # swapping the two grid axes gives row_index[b, row, col, box] == row
    row_index = tf.transpose(column_index, (0, 2, 1, 3))
    column_index = tf.cast(column_index, dtype=tf.float32)
    row_index = tf.cast(row_index, dtype=tf.float32)

    # position in cell units minus the index of the containing cell
    x = labels_boxes[..., 0] * cell_size - column_index
    y = labels_boxes[..., 1] * cell_size - row_index
    sqrt_w = tf.sqrt(labels_boxes[..., 2])
    sqrt_h = tf.sqrt(labels_boxes[..., 3])
    cell_labals_boxes = tf.stack([x, y, sqrt_w, sqrt_h], axis=-1)
    return cell_labals_boxes
def calc_mask(self, iou, response):
    """
    Compute the object / no-object predictor masks.

    :param iou: IoU of each predictor with the label box [batch, 7, 7, 2]
    :param response: 1 where the cell holds an object [batch, 7, 7, 1]
    :return: (object_mask, no_object_mask), both [batch, 7, 7, 2]
    """
    # highest IoU among the predictors of each cell
    best_iou = tf.reduce_max(iou, axis=-1, keep_dims=True)
    # 1 for every predictor reaching that maximum (ties mark all tied predictors)
    is_best = tf.cast((iou >= best_iou), tf.float32)
    # keep the best predictor only in cells that really contain an object
    object_mask = is_best * response
    # everything that is not a responsible predictor
    no_object_mask = tf.ones_like(object_mask, dtype=tf.float32) - object_mask
    return object_mask, no_object_mask
训练代码:
在训练代码train.py中,我们对损失加入L2正则化,并使用指数衰减更新学习率:
def train():
    """
    Build the training graph and run the training loop.

    Reads its settings from the module-level ``solver_params``, ``path_params``
    and ``model_params``. Optionally restores weights / the latest checkpoint,
    then periodically saves checkpoints, writes summaries and prints the loss.
    """
    start_step = 0
    log_step = solver_params['log_step']
    display_step = solver_params['display_step']
    restore = solver_params['restore']
    checkpoint_dir = path_params['checkpoints_dir']
    checkpoints_name = path_params['checkpoints_name']
    tfrecord_dir = path_params['tfrecord_dir']
    tfrecord_name = path_params['train_tfrecord_name']
    log_dir = path_params['logs_dir']
    weights_file = path_params['weights_file']

    # GPU configuration
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)

    # Input pipeline: batches of training images and labels
    data = tfrecord.TFRecord()
    train_tfrecord = os.path.join(tfrecord_dir, tfrecord_name)
    image_batch, label_batch = data.parse_batch_examples(train_tfrecord)

    # Placeholders fed with the batches pulled from the queue
    inputs = tf.placeholder(dtype=tf.float32, shape=[None, model_params['image_size'], model_params['image_size'], model_params['channels']], name='inputs')
    outputs = tf.placeholder(dtype=tf.float32, shape=[None, model_params['cell_size'], model_params['cell_size'], 5 + model_params['num_classes']], name='outputs')

    # Network
    Model = network.Network(is_train=True)
    logits = Model._build_network(inputs)

    # Loss
    Losses = loss_utils.Loss(logits, outputs, 'loss')
    loss_op = tf.losses.get_total_loss()
    # NOTE(review): the network layers already declare an L2 weights_regularizer
    # and get_total_loss() adds regularization losses by default, so this term
    # may apply weight decay a second time - confirm this is intended.
    trainable_vars = tf.trainable_variables()
    l2_reg_loss_op = tf.add_n([tf.nn.l2_loss(var) for var in trainable_vars]) * solver_params['weight_decay']
    total_loss = loss_op + l2_reg_loss_op
    tf.summary.scalar('total_loss', total_loss)

    global_step = tf.train.create_global_step()

    # Exponentially decayed learning rate
    learning_rate = tf.train.exponential_decay(
        solver_params['learning_rate'],
        global_step,
        solver_params['decay_steps'],
        solver_params['decay_rate'],
        solver_params['staircase'],
        name='learning_rate')

    # Optimizer: plain stochastic gradient descent
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        train_op = slim.learning.create_train_op(total_loss, optimizer, global_step)

    # Checkpointing
    save_variable = tf.global_variables()
    saver = tf.train.Saver(save_variable, max_to_keep=1000)

    # TensorBoard
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(log_dir, graph=tf.get_default_graph(), flush_secs=60)

    with tf.Session(config=config) as sess:
        init_var_op = tf.global_variables_initializer()
        sess.run(init_var_op)

        if weights_file is not None:
            print('Restoring weights from: ' + weights_file)
            saver.restore(sess, weights_file)

        if restore:
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
            if ckpt and ckpt.model_checkpoint_path:
                stem = os.path.basename(ckpt.model_checkpoint_path)
                # The saver appends '-<step>' to the checkpoint name. Take the
                # text after the last '-' first, so names containing a dot
                # (e.g. 'model.ckpt-1000') parse as well as 'model-1000'.
                restore_step = int(stem.split('-')[-1].split('.')[0])
                start_step = restore_step
                sess.run(global_step.assign(restore_step))
                saver.restore(sess, ckpt.model_checkpoint_path)
                print('Restoreing from {}'.format(ckpt.model_checkpoint_path))
            else:
                print("Failed to find a checkpoint")

        coordinate = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coordinate, sess=sess)
        summary_writer.add_graph(sess.graph)

        try:
            # One loop iteration == one training step (one batch)
            for step in range(start_step + 1, solver_params['max_iter']):
                start_time = time.time()

                if coordinate.should_stop():
                    break

                image, label = sess.run([image_batch, label_batch])
                feed_dict = {inputs: image, outputs: label}
                _, loss, current_global_step = sess.run([train_op, total_loss, global_step], feed_dict=feed_dict)

                end_time = time.time()

                if step % solver_params['save_step'] == 0:
                    save_path = saver.save(sess, os.path.join(checkpoint_dir, checkpoints_name), global_step=step)
                    print('Save modle into {}....'.format(save_path))

                if step % log_step == 0:
                    summary = sess.run(summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary, global_step=step)

                if step % display_step == 0:
                    per_iter_time = end_time - start_time
                    print("step:{:.0f} total_loss: {:.5f} {:.2f} s/iter".format(step, loss, per_iter_time))
        finally:
            # Stop the queue threads and flush summaries even if a step raised;
            # the session itself is closed by the enclosing `with`.
            coordinate.request_stop()
            coordinate.join(threads)
            summary_writer.close()
测试代码:
测试代码predict.py中对输入图片resize到448*448尺寸,因为网络加入了全连接层,所以尺寸必须固定。然后做简单的归一化将0~255的像素值压缩到-1~+1,输入网络进行前向传播:
def predict(test_dir, checkpoints):
    """
    Run detection on every image of a directory and draw the results.

    :param test_dir: directory containing the test images
    :param checkpoints: checkpoint (weights) file to restore
    :return: None
    """
    image_size = model_params['image_size']
    # `inputs` instead of `input`: do not shadow the builtin
    inputs = tf.placeholder(tf.float32, [None, image_size, image_size, model_params['channels']], name='input')

    # Network
    Model = network.Network(is_train=False)
    logits = Model._build_network(inputs)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, checkpoints)

        for filename in os.listdir(test_dir):
            file = os.path.join(test_dir, filename)

            image = cv2.imread(file)
            if image is None:
                # cv2.imread returns None for anything it cannot decode
                print('Skip unreadable image: ' + file)
                continue

            # image.shape is (height, width, channels)
            image_height, image_width = image.shape[:2]

            image = cv2.resize(image, (image_size, image_size))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
            # map pixel values from [0, 255] to [-1, 1]
            image = (image / 255.0) * 2.0 - 1.0

            batch_image = np.zeros([1, image_size, image_size, model_params['channels']])
            batch_image[0, :, :, :] = image

            output = sess.run(logits, feed_dict={inputs: batch_image})
            # The network returns one row per image; post_processing expects
            # the flat output of a single image.
            result = post_processing(output[0])

            # Scale the boxes from the network input size back to the original
            # image: x / w with the width ratio, y / h with the height ratio.
            x_scale = 1.0 * image_width / image_size
            y_scale = 1.0 * image_height / image_size
            for detection in result:
                detection[1] *= x_scale
                detection[2] *= y_scale
                detection[3] *= x_scale
                detection[4] *= y_scale

            draw_results(file, result)
对网络的输出进行解析,通过类别置信度阈值过滤和非极大值抑制得到最终的检测结果:
def post_processing(outputs):
    """
    Decode the raw network output of ONE image into detections, using a
    class-confidence threshold followed by non-maximum suppression.

    :param outputs: raw network output of a single image, either flat or with
        a leading batch axis of size 1; it is not modified
    :return: list of [class_name, x, y, w, h, prob], boxes given as center /
        size in pixels of the network input image
    :raises ValueError: if the number of values does not match the model layout
    """
    cell_size = model_params['cell_size']
    boxes_per_cell = model_params['boxes_per_cell']
    num_classes = model_params['num_classes']

    boundary1 = cell_size * cell_size * num_classes
    boundary2 = boundary1 + cell_size * cell_size * boxes_per_cell
    expected_size = boundary2 + cell_size * cell_size * boxes_per_cell * 4

    # Work on a flat private copy: accepts a [1, N] batch row as well as a flat
    # vector, and keeps the in-place box decoding below from mutating the
    # caller's array.
    outputs = np.array(outputs).reshape(-1)
    if outputs.size != expected_size:
        raise ValueError('post_processing expects {} values for one image, got {}'.format(
            expected_size, outputs.size))

    predict_class_prob = np.reshape(outputs[0:boundary1], [cell_size, cell_size, num_classes])
    # objectness confidence of every predicted box
    predict_confidence = np.reshape(outputs[boundary1:boundary2], [cell_size, cell_size, boxes_per_cell])
    # box parameters; the predicted center is an offset relative to its cell
    predict_bboxs = np.reshape(outputs[boundary2:], [cell_size, cell_size, boxes_per_cell, 4])

    # Cell offsets: x uses the column index, y the row index
    column_index = np.arange(cell_size).reshape(1, cell_size, 1)
    row_index = np.arange(cell_size).reshape(cell_size, 1, 1)
    predict_bboxs[:, :, :, 0] += column_index
    predict_bboxs[:, :, :, 1] += row_index
    # (x, y) as a fraction of the whole image
    predict_bboxs[:, :, :, :2] = 1.0 * predict_bboxs[:, :, :, 0:2] / cell_size
    # the network predicts sqrt(w), sqrt(h); square to get image fractions
    predict_bboxs[:, :, :, 2:] = np.square(predict_bboxs[:, :, :, 2:])
    # fractions -> pixels of the network input image
    predict_bboxs = predict_bboxs * model_params['image_size']

    # Class-specific confidence of every box: prob = confidence * class_prob,
    # shape [cell, cell, box, class]
    prob = (predict_confidence[:, :, :, np.newaxis] *
            predict_class_prob[:, :, np.newaxis, :]).astype(np.float64)

    # Keep every (cell, box, class) entry above the threshold
    filter_probs = prob >= test_params['prob_threshold']
    rows, cols, box_ids, class_ids = np.nonzero(filter_probs)
    probs_filtered = prob[filter_probs]
    boxes_filtered = predict_bboxs[rows, cols, box_ids]
    # The class of each kept entry is its own index on the class axis. Taking
    # argmax over the boolean mask instead would report the FIRST passing class
    # for every entry of a box.
    classes_num_filtered = class_ids

    # Sort by descending class confidence
    argsort = np.array(np.argsort(probs_filtered))[::-1]
    boxes_filtered = boxes_filtered[argsort]
    probs_filtered = probs_filtered[argsort]
    classes_num_filtered = classes_num_filtered[argsort]

    # Non-maximum suppression: a box overlapping a stronger kept box is dropped
    num_boxes = len(boxes_filtered)
    for i in range(num_boxes):
        if probs_filtered[i] == 0:
            continue
        for j in range(i + 1, num_boxes):
            if calculate_iou(boxes_filtered[i], boxes_filtered[j]) > test_params['iou_threshold']:
                probs_filtered[j] = 0.0

    filter_iou = np.array(probs_filtered > 0.0, dtype='bool')
    boxes_filtered = boxes_filtered[filter_iou]
    probs_filtered = probs_filtered[filter_iou]
    classes_num_filtered = classes_num_filtered[filter_iou]

    return [
        [CLASSES[class_id], box[0], box[1], box[2], box[3], box_prob]
        for class_id, box, box_prob in zip(classes_num_filtered, boxes_filtered, probs_filtered)
    ]
代码细节讲解得比较简单,读者可以结合代码中的注释阅读。v1总体不太复杂,大家可以自己动手重写一遍完整流程。