本文主要通过啃https://github.com/YunYang1994/tensorflow-yolov3代码来学习了下yolov3,此外从B站up主https://www.bilibili.com/video/BV1i64y1u7Zj?p=2学习了目标检测的基础知识,例如NMS,IOU,YOLO发展等。
一、网络结构,前向计算过程
上述结构图源自参考链接[6],个人总结以下特点:Darknet-53主要为残差网络结构;DBL网络块中利用卷积代替池化进行2倍下采样;3尺度输出,每一种尺度对应不同大小物体的检测;网络的输出维度与图片的输入维度无关。详细的网络结构可以参照参考链接[5],摘录代码如下:
基本网络块:DBL、RES、CONCAT、UPsample
def convolutional(input_data, filters_shape, trainable, name, downsample=False, activate=True, bn=True):
    """Build one convolution layer, optionally followed by batch norm and leaky ReLU.

    Args:
        input_data: input tensor; axes 1 and 2 are padded when downsampling
            (presumably NHWC - confirm against callers).
        filters_shape: kernel shape handed to ``tf.get_variable``; entries 0 and 1
            are used as kernel height/width, the last entry as the bias size.
        trainable: forwarded as ``training`` to batch normalization only; the
            weight and bias variables are always created with ``trainable=True``.
        name: variable scope under which the layer's variables are created.
        downsample: when true, pad explicitly and convolve with stride 2 and
            ``VALID`` padding; otherwise stride 1 with ``SAME`` padding.
        activate: apply ``leaky_relu(alpha=0.1)`` to the result.
        bn: use batch normalization; when false a bias is added instead.

    Returns:
        The output tensor of the layer.
    """
    with tf.variable_scope(name):
        # Defaults for the non-downsampling case.
        strides, padding = (1, 1, 1, 1), "SAME"
        if downsample:
            # Explicit padding on the two middle axes, then a strided VALID convolution.
            pad_h = (filters_shape[0] - 2) // 2 + 1
            pad_w = (filters_shape[1] - 2) // 2 + 1
            pad_spec = tf.constant([[0, 0], [pad_h, pad_h], [pad_w, pad_w], [0, 0]])
            input_data = tf.pad(input_data, pad_spec, 'CONSTANT')
            strides, padding = (1, 2, 2, 1), 'VALID'

        # Convolution kernel, initialised from N(0, 0.01).
        kernel = tf.get_variable(name='weight', dtype=tf.float32, trainable=True,
                                 shape=filters_shape,
                                 initializer=tf.random_normal_initializer(stddev=0.01))
        conv = tf.nn.conv2d(input=input_data, filter=kernel, strides=strides, padding=padding)

        if not bn:
            # Without batch norm the layer gets a zero-initialised bias.
            offset = tf.get_variable(name='bias', shape=filters_shape[-1], trainable=True,
                                     dtype=tf.float32, initializer=tf.constant_initializer(0.0))
            conv = tf.nn.bias_add(conv, offset)
        else:
            conv = tf.layers.batch_normalization(
                conv,
                beta_initializer=tf.zeros_initializer(),
                gamma_initializer=tf.ones_initializer(),
                moving_mean_initializer=tf.zeros_initializer(),
                moving_variance_initializer=tf.ones_initializer(),
                training=trainable)

        # Equality test (not truthiness) kept on purpose so non-bool arguments behave as before.
        if activate == True:
            conv = tf.nn.leaky_relu(conv, alpha=0.1)
    return conv
#残差块 输入与经过1*1卷积与3*3卷积后的输出相加 最终input_data short_cut尺寸一样
def residual_block(input_data, input_channel, filter_num1, filter_num2, trainable, name):
    """Residual unit: a 1x1 convolution, a 3x3 convolution, then add the input back.

    Args:
        input_data: tensor entering the block; also used as the shortcut branch.
        input_channel: channel count of ``input_data``.
        filter_num1: output channels of the 1x1 convolution.
        filter_num2: output channels of the 3x3 convolution; the addition
            requires this branch to match the shortcut's shape.
        trainable: forwarded to both ``convolutional`` calls.
        name: variable scope for the block.

    Returns:
        The sum of the convolved branch and the shortcut.
    """
    with tf.variable_scope(name):
        squeezed = convolutional(input_data,
                                 filters_shape=(1, 1, input_channel, filter_num1),
                                 trainable=trainable, name='conv1')
        expanded = convolutional(squeezed,
                                 filters_shape=(3, 3, filter_num1, filter_num2),
                                 trainable=trainable, name='conv2')
        return expanded + input_data
#图层叠加
def route(name, previous_output, current_output):
    """Concatenate ``current_output`` and ``previous_output`` along the last axis.

    The concatenation is created inside variable scope ``name``; the current
    output comes first in the result.
    """
    with tf.variable_scope(name):
        return tf.concat([current_output, previous_output], axis=-1)
#上采样 resize和反卷积两种方式
def upsample(input_data, name, method="deconv"):
    """Double the size of axes 1 and 2 of ``input_data``.

    Args:
        input_data: tensor to enlarge.
        name: variable scope used by the ``"resize"`` path only.
        method: ``"resize"`` for nearest-neighbour resizing, or ``"deconv"``
            for a stride-2 transposed convolution that keeps the channel count.

    Returns:
        The upsampled tensor.
    """
    assert method in ["resize", "deconv"]

    if method == "resize":
        with tf.variable_scope(name):
            dynamic_shape = tf.shape(input_data)
            doubled = (dynamic_shape[1] * 2, dynamic_shape[2] * 2)
            output = tf.image.resize_nearest_neighbor(input_data, doubled)
    elif method == "deconv":
        # The channel count must be statically known for the transposed convolution.
        channels = input_data.shape.as_list()[-1]
        output = tf.layers.conv2d_transpose(input_data, channels, kernel_size=2, padding='same',
                                            strides=(2, 2),
                                            kernel_initializer=tf.random_normal_initializer())
    return output
Darknet53结构
#假设输入图片为416*416*3
def darknet53(input_data, trainable):
    """Build the Darknet-53 backbone and return three feature maps.

    The network is one stem convolution followed by five stages; each stage is
    a stride-2 convolution and a run of residual blocks.  The spatial sizes in
    the comments assume a 416x416x3 input.

    Args:
        input_data: image batch tensor.
        trainable: forwarded to every ``common.convolutional`` /
            ``common.residual_block`` call.

    Returns:
        ``(route_1, route_2, output)``: the outputs of the third, fourth and
        fifth stages (52x52x256, 26x26x512 and 13x13x1024 for a 416 input).
    """
    # (downsample conv scope, in channels, out channels, residual count, first residual index)
    stages = (
        ('conv1', 32, 64, 1, 0),       # -> 208x208x64
        ('conv4', 64, 128, 2, 1),      # -> 104x104x128
        ('conv9', 128, 256, 8, 3),     # -> 52x52x256
        ('conv26', 256, 512, 8, 11),   # -> 26x26x512
        ('conv43', 512, 1024, 4, 19),  # -> 13x13x1024
    )

    with tf.variable_scope('darknet'):
        # Stem: 3x3 convolution, 3 -> 32 channels, resolution unchanged.
        net = common.convolutional(input_data, filters_shape=(3, 3, 3, 32),
                                   trainable=trainable, name='conv0')

        stage_outputs = []
        for conv_name, in_ch, out_ch, num_blocks, first_index in stages:
            net = common.convolutional(net, filters_shape=(3, 3, in_ch, out_ch),
                                       trainable=trainable, name=conv_name, downsample=True)
            # Each residual block squeezes to in_ch with a 1x1 conv and restores out_ch with a 3x3 conv.
            for block_index in range(first_index, first_index + num_blocks):
                net = common.residual_block(net, out_ch, in_ch, out_ch, trainable=trainable,
                                            name='residual%d' % block_index)
            stage_outputs.append(net)

        route_1, route_2 = stage_outputs[2], stage_outputs[3]
        return route_1, route_2, net
YOLOV3结构
# Build the three raw detection heads.  Only ordinary exceptions are converted
# (a bare ``except:`` would also swallow KeyboardInterrupt/SystemExit), and the
# original error is chained so the real cause stays visible in the traceback.
try:
    self.conv_lbbox, self.conv_mbbox, self.conv_sbbox = self.__build_nework(input_data)
except Exception as err:
    raise NotImplementedError("Can not build up yolov3 network!") from err

# Decode each head with the anchors and stride at the matching index (0, 1, 2).
with tf.variable_scope('pred_sbbox'):
    self.pred_sbbox = self.decode(self.conv_sbbox, self.anchors[0], self.strides[0])
with tf.variable_scope('pred_mbbox'):
    self.pred_mbbox = self.decode(self.conv_mbbox, self.anchors[1], self.strides[1])
with tf.variable_scope('pred_lbbox'):
    self.pred_lbbox = self.decode(self.conv_lbbox, self.anchors[2], self.strides[2])
def __build_nework(self, input_data):
    """Build the YOLOv3 detection network on top of the Darknet-53 backbone.

    Spatial sizes in the comments assume a 416x416 input.

    Args:
        input_data: image batch tensor fed to ``backbone.darknet53``.

    Returns:
        ``(conv_lbbox, conv_mbbox, conv_sbbox)``: raw head outputs at the
        13x13, 26x26 and 52x52 grids, each with ``3 * (num_class + 5)`` channels.
    """
    def dbl(tensor, shape, scope):
        # Convolution with the default batch norm and leaky ReLU.
        return common.convolutional(tensor, shape, self.trainable, scope)

    def conv_stack(tensor, first_in, wide, first_index):
        # Five alternating 1x1/3x3 convolutions named conv<first_index> .. conv<first_index + 4>.
        narrow = wide // 2
        shapes = ((1, 1, first_in, narrow),
                  (3, 3, narrow, wide),
                  (1, 1, wide, narrow),
                  (3, 3, narrow, wide),
                  (1, 1, wide, narrow))
        for offset, shape in enumerate(shapes):
            tensor = dbl(tensor, shape, 'conv%d' % (first_index + offset))
        return tensor

    def detection_head(tensor, narrow, branch_scope, bbox_scope):
        # 3x3 convolution, then a plain 1x1 convolution (no batch norm, no activation).
        branch = common.convolutional(tensor, (3, 3, narrow, narrow * 2), self.trainable,
                                      name=branch_scope)
        return common.convolutional(branch, (1, 1, narrow * 2, 3 * (self.num_class + 5)),
                                    trainable=self.trainable, name=bbox_scope,
                                    activate=False, bn=False)

    # route_1: 52x52x256, route_2: 26x26x512, input_data: 13x13x1024
    route_1, route_2, input_data = backbone.darknet53(input_data, self.trainable)

    # Coarsest grid: conv52..conv56 leave 13x13x512.
    input_data = conv_stack(input_data, 1024, 1024, 52)
    conv_lbbox = detection_head(input_data, 512, 'conv_lobj_branch', 'conv_lbbox')

    # Reduce to 256 channels, upsample to 26x26 and merge with route_2 (256 + 512 = 768 channels).
    input_data = dbl(input_data, (1, 1, 512, 256), 'conv57')
    input_data = common.upsample(input_data, name='upsample0', method=self.upsample_method)
    with tf.variable_scope('route_1'):
        input_data = tf.concat([input_data, route_2], axis=-1)

    # Middle grid: conv58..conv62 leave 26x26x256.
    input_data = conv_stack(input_data, 768, 512, 58)
    conv_mbbox = detection_head(input_data, 256, 'conv_mobj_branch', 'conv_mbbox')

    # Reduce to 128 channels, upsample to 52x52 and merge with route_1 (128 + 256 = 384 channels).
    input_data = dbl(input_data, (1, 1, 256, 128), 'conv63')
    input_data = common.upsample(input_data, name='upsample1', method=self.upsample_method)
    with tf.variable_scope('route_2'):
        input_data = tf.concat([input_data, route_1], axis=-1)

    # Finest grid: conv64..conv68 leave 52x52x128.
    input_data = conv_stack(input_data, 384, 256, 64)
    conv_sbbox = detection_head(input_data, 128, 'conv_sobj_branch', 'conv_sbbox')

    return conv_lbbox, conv_mbbox, conv_sbbox
从网络整体来看,相当于将输入图片划分为13*13,26*26,52*52的区域,每一个区域经过网络预测3个框,对应255个输出[(tx,ty,tw,th,tc,prob*80),(tx,ty,tw,th,tc,prob*80),(tx,ty,tw,th,tc,prob*80)],注意这个输出(即conv_lbbox, conv_mbbox, conv_sbbox)并不是边框实际位置,网络输出还需要经过直接位置预测,得到原始图片尺寸的边框信息(x,y,w,h,conf,class)
摘录代码如下:conv_output为网络输出(如13*13*255),stride为该尺度对应的下采样倍数(8、16或32)
def decode(self, conv_output, anchors, stride):
    """Turn one raw head output into boxes, confidences and class probabilities.

    Args:
        conv_output: raw head tensor; it is reshaped to
            ``(batch, size, size, len(anchors), 5 + num_class)``.
        anchors: the anchor sizes for this scale, multiplied into the
            exponentiated width/height outputs.
        stride: factor that maps grid units to input-image pixels.

    Returns:
        Tensor whose last axis is ``(x, y, w, h, conf, class probabilities...)``,
        with the box expressed in input-image pixels.
    """
    dyn_shape = tf.shape(conv_output)
    batch_size, output_size = dyn_shape[0], dyn_shape[1]
    anchor_per_scale = len(anchors)

    conv_output = tf.reshape(
        conv_output,
        (batch_size, output_size, output_size, anchor_per_scale, 5 + self.num_class))

    raw_dxdy = conv_output[:, :, :, :, 0:2]   # centre offsets
    raw_dwdh = conv_output[:, :, :, :, 2:4]   # width/height factors
    raw_conf = conv_output[:, :, :, :, 4:5]   # objectness logit
    raw_prob = conv_output[:, :, :, :, 5:]    # class logits

    # Grid of cell coordinates: rows[i, j] = i, cols[i, j] = j.
    cell_range = tf.range(output_size, dtype=tf.int32)
    rows = tf.tile(cell_range[:, tf.newaxis], [1, output_size])
    cols = tf.tile(cell_range[tf.newaxis, :], [output_size, 1])
    xy_grid = tf.concat([cols[:, :, tf.newaxis], rows[:, :, tf.newaxis]], axis=-1)
    xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :],
                      [batch_size, 1, 1, anchor_per_scale, 1])
    xy_grid = tf.cast(xy_grid, tf.float32)

    # Cell index plus sigmoid offset, scaled by the stride to input-image pixels.
    pred_xy = (tf.sigmoid(raw_dxdy) + xy_grid) * stride
    # Anchor size scaled by exp(raw), then by the stride.
    pred_wh = (tf.exp(raw_dwdh) * anchors) * stride

    return tf.concat([tf.concat([pred_xy, pred_wh], axis=-1),
                      tf.sigmoid(raw_conf),
                      tf.sigmoid(raw_prob)], axis=-1)
直接位置预测之后,一张图片会计算出很多个预测框,对这些预测框进行后处理与NMS,得到最终的检测结果。后处理代码摘录如下:pred_bbox为最终预测框,org_img_shape为输入图片的原始大小,input_size为网络输入图片大小,score_threshold为分数阈值
def postprocess_boxes(pred_bbox, org_img_shape, input_size, score_threshold,
                      valid_scale=(0, np.inf)):
    """Map decoded predictions back to the original image and filter them.

    Args:
        pred_bbox: array-like of rows ``(x, y, w, h, conf, class probs...)``
            in network-input pixels.
        org_img_shape: ``(height, width)`` of the original image.
        input_size: side length of the square network input.
        score_threshold: boxes whose ``conf * best class prob`` is not above
            this value are dropped.
        valid_scale: open interval ``(low, high)`` for ``sqrt(box area)``;
            boxes outside it are dropped.  The default matches the previously
            hard-coded ``[0, np.inf]``.

    Returns:
        Float array of shape ``(k, 6)`` with rows
        ``(xmin, ymin, xmax, ymax, score, class index)`` in original-image pixels.
        An empty input yields a ``(0, 6)`` array.
    """
    pred_bbox = np.array(pred_bbox)
    if pred_bbox.size == 0:
        # Nothing to process; without this guard the 2-D indexing below raises IndexError.
        return np.zeros((0, 6))

    pred_xywh = pred_bbox[:, 0:4]
    pred_conf = pred_bbox[:, 4]
    pred_prob = pred_bbox[:, 5:]

    # (1) (x, y, w, h) -> (xmin, ymin, xmax, ymax)
    half_wh = pred_xywh[:, 2:] * 0.5
    pred_coor = np.concatenate([pred_xywh[:, :2] - half_wh,
                                pred_xywh[:, :2] + half_wh], axis=-1)

    # (2) Undo the resize and centred padding to get original-image coordinates.
    org_h, org_w = org_img_shape
    resize_ratio = min(input_size / org_w, input_size / org_h)
    dw = (input_size - resize_ratio * org_w) / 2
    dh = (input_size - resize_ratio * org_h) / 2
    pred_coor[:, 0::2] = 1.0 * (pred_coor[:, 0::2] - dw) / resize_ratio
    pred_coor[:, 1::2] = 1.0 * (pred_coor[:, 1::2] - dh) / resize_ratio

    # (3) Clip boxes to the image; boxes that end up inverted are zeroed out
    #     (their scale becomes 0, so step (4) removes them).
    pred_coor = np.concatenate([np.maximum(pred_coor[:, :2], [0, 0]),
                                np.minimum(pred_coor[:, 2:], [org_w - 1, org_h - 1])], axis=-1)
    invalid_mask = np.logical_or(pred_coor[:, 0] > pred_coor[:, 2],
                                 pred_coor[:, 1] > pred_coor[:, 3])
    pred_coor[invalid_mask] = 0

    # (4) Keep only boxes whose scale lies strictly inside valid_scale.
    bboxes_scale = np.sqrt(np.multiply.reduce(pred_coor[:, 2:4] - pred_coor[:, 0:2], axis=-1))
    scale_mask = np.logical_and(valid_scale[0] < bboxes_scale, bboxes_scale < valid_scale[1])

    # (5) Keep only boxes whose best-class score exceeds the threshold.
    classes = np.argmax(pred_prob, axis=-1)
    scores = pred_conf * pred_prob[np.arange(len(pred_coor)), classes]
    score_mask = scores > score_threshold

    mask = np.logical_and(scale_mask, score_mask)
    coors, scores, classes = pred_coor[mask], scores[mask], classes[mask]
    return np.concatenate([coors, scores[:, np.newaxis], classes[:, np.newaxis]], axis=-1)
NMS代码摘录如下
#对每个类别的预测框进行box nms
def nms(bboxes, iou_threshold, sigma=0.3, method='nms'):
    """Per-class non-maximum suppression.

    Args:
        bboxes: array with rows ``(xmin, ymin, xmax, ymax, score, class)``.
        iou_threshold: for ``method='nms'``, boxes overlapping the selected
            box by more than this IoU are discarded.
        sigma: for ``method='soft-nms'``, scores are multiplied by
            ``exp(-iou**2 / sigma)`` instead of being zeroed.
        method: ``'nms'`` or ``'soft-nms'``.

    Returns:
        List of the kept box rows.

    Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf
          https://github.com/bharatsingh430/soft-nms
    """
    kept = []
    for cls in list(set(bboxes[:, 5])):
        candidates = bboxes[bboxes[:, 5] == cls]

        while len(candidates) > 0:
            # Select the highest-scoring remaining box and remove it from the pool.
            top = np.argmax(candidates[:, 4])
            best = candidates[top]
            kept.append(best)
            candidates = np.concatenate([candidates[:top], candidates[top + 1:]])

            overlap = bboxes_iou(best[np.newaxis, :4], candidates[:, :4])
            weight = np.ones((len(overlap),), dtype=np.float32)

            assert method in ['nms', 'soft-nms']
            if method == 'nms':
                weight[overlap > iou_threshold] = 0.0
            elif method == 'soft-nms':
                weight = np.exp(-(1.0 * overlap ** 2 / sigma))

            # Down-weight the scores and drop boxes whose score reached zero.
            candidates[:, 4] = candidates[:, 4] * weight
            candidates = candidates[candidates[:, 4] > 0.]
    return kept
二、数据准备
以voc数据集为例,voc数据集的图片信息以xml文件存储,第一步应解析xml文件获得数据集信息,每张图片的信息表示为['image path','xmin','ymin','xmax','ymax','class_id',...],最终得到voc_train.txt与voc_test.txt,代码如下:
# -*- coding: utf-8 -*-
#xml文件读取可参考https://www.w3school.com.cn/xmldom/dom_nodes_info.asp
import xml.dom.minidom
import os
import parameter
def _bndbox_value(object_node, tag):
    """Return the text of coordinate ``tag`` inside the object's first <bndbox>."""
    bndbox = object_node.getElementsByTagName('bndbox')[0]
    return bndbox.getElementsByTagName(tag)[0].childNodes[0].data


def _annotation_line(xml_path, image_dir):
    """Convert one VOC XML file to a text line.

    The line is the image path followed by one ``xmin,ymin,xmax,ymax,class_id``
    group per object; every group is followed by a space and the line ends with
    a newline.
    """
    # Parse the XML document with minidom.
    collection = xml.dom.minidom.parse(xml_path).documentElement
    filename = collection.getElementsByTagName("filename")[0].childNodes[0].data
    fields = [os.path.join(image_dir, filename)]
    for object_node in collection.getElementsByTagName("object"):
        values = [_bndbox_value(object_node, tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        class_name = object_node.getElementsByTagName('name')[0].childNodes[0].data
        values.append(str(parameter.voc_name.index(class_name)))
        fields.append(','.join(values))
    return ' '.join(fields) + ' \n'


def _write_voc_annotations(annotation_dir, image_dir, output_path, mode='w'):
    """Write one line per XML file in ``annotation_dir`` to ``output_path``.

    ``mode`` is the file mode: ``'w'`` truncates the file on open (so no
    explicit seek/truncate is needed) and ``'a'`` appends to it.
    """
    # List the directory before opening the output, so a missing directory
    # does not leave an empty output file behind.
    filelist = os.listdir(annotation_dir)
    with open(output_path, mode) as file_handle:
        for xmlfile in filelist:
            file_handle.write(_annotation_line(os.path.join(annotation_dir, xmlfile), image_dir))


testfile_path = "VOC2007_TEST/Annotations"
_write_voc_annotations(testfile_path, "VOC2007_TEST/JPEGImages/", "voc_test.txt")
print('VOC2007_TEST Read end!')

trainfile_path1 = "VOC2007_TRAIN/Annotations"
_write_voc_annotations(trainfile_path1, "VOC2007_TRAIN/JPEGImages/", "voc_train.txt")
print('VOC2007_TRAIN Read end!')

trainfile_path2 = "VOC2012_TRAIN/Annotations"
# The VOC2012 entries are appended to the same training list.
_write_voc_annotations(trainfile_path2, "VOC2012_TRAIN/JPEGImages/", "voc_train.txt", mode='a')
print('VOC2012_TRAIN Read end!')
接下来可以加载图片信息,进行相关处理,得到图片的对应标签。处理流程如下:
1、加载所有图片信息
#加载voc_train.txt或者voc_test.txt
def load_annotations(data_path):
    """Read an annotation list file and return its usable lines in random order.

    Each line is stripped of surrounding whitespace; lines with nothing after
    the first whitespace-separated field (no ground-truth boxes) are skipped.
    The resulting list is shuffled with ``np.random.shuffle``.
    """
    annotations = []
    with open(data_path, 'r') as f:
        for raw_line in f.readlines():
            record = raw_line.strip()
            # Need the image path plus at least one box group.
            if len(record.split()) > 1:
                annotations.append(record)
    np.random.shuffle(annotations)
    return annotations
2、解析图片信息得到图片数据与边框数据,并对图片和边框进行数据增强和尺寸统一(原始图片尺寸不一致)
#数据增强 随机水平翻转
def random_horizontal_flip(image, bboxes):
    """With probability 0.5, mirror the image left-right and adjust the boxes.

    Args:
        image: 3-D array indexed (row, column, channel).
        bboxes: array whose columns 0 and 2 are xmin and xmax; modified in
            place when the flip is applied.

    Returns:
        ``(image, bboxes)``, flipped or unchanged.
    """
    if random.random() >= 0.5:
        return image, bboxes

    _, width, _ = image.shape
    # Reverse the column axis; rows and channels stay as they are.
    image = image[:, ::-1, :]
    # The old xmax becomes the new xmin and vice versa, measured from the right edge.
    bboxes[:, [0, 2]] = width - bboxes[:, [2, 0]]
    return image, bboxes
#数据增强 随机裁剪
def random_crop(image, bboxes):
    """With probability 0.5, crop the image to a random window containing every box.

    Args:
        image: 3-D array indexed (row, column, channel).
        bboxes: array whose first four columns are ``xmin, ymin, xmax, ymax``;
            shifted in place to the cropped image's origin.

    Returns:
        ``(image, bboxes)``, cropped or unchanged.  An empty ``bboxes`` is
        returned unchanged because there is no enclosing box to build a crop
        window from.
    """
    if random.random() < 0.5 and len(bboxes) > 0:
        h, w, _ = image.shape
        # Smallest rectangle enclosing all boxes: (xmin, ymin, xmax, ymax).
        max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0),
                                   np.max(bboxes[:, 2:4], axis=0)], axis=-1)

        # Free margin between the enclosing rectangle and each image border.
        max_l_trans = max_bbox[0]
        max_u_trans = max_bbox[1]
        max_r_trans = w - max_bbox[2]
        max_d_trans = h - max_bbox[3]

        # Each side of the crop window lies a random distance inside the free margin.
        crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans)))
        crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans)))
        # Clamp with min(): the previous max(w, ...) / max(h, ...) always gave at
        # least the full width/height, so the right and bottom were never cropped.
        crop_xmax = min(w, int(max_bbox[2] + random.uniform(0, max_r_trans)))
        crop_ymax = min(h, int(max_bbox[3] + random.uniform(0, max_d_trans)))

        image = image[crop_ymin:crop_ymax, crop_xmin:crop_xmax]

        # The image origin moved, so shift the box coordinates accordingly.
        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin
        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin
    return image, bboxes
#数据增强 随机移动
def random_translate(image, bboxes):
    """With probability 0.5, shift the image by a random offset and move the boxes with it.

    The offset range is derived from the free margin between the rectangle
    enclosing all boxes and the image borders, less one pixel on each side.

    Args:
        image: 3-D array indexed (row, column, channel).
        bboxes: array whose first four columns are ``xmin, ymin, xmax, ymax``;
            modified in place when the shift is applied.

    Returns:
        ``(image, bboxes)``, translated or unchanged.
    """
    if random.random() >= 0.5:
        return image, bboxes

    h, w, _ = image.shape
    enclosing = np.concatenate([np.min(bboxes[:, 0:2], axis=0),
                                np.max(bboxes[:, 2:4], axis=0)], axis=-1)
    margin_left, margin_top = enclosing[0], enclosing[1]
    margin_right, margin_bottom = w - enclosing[2], h - enclosing[3]

    tx = random.uniform(-(margin_left - 1), (margin_right - 1))
    ty = random.uniform(-(margin_top - 1), (margin_bottom - 1))

    # 2x3 affine matrix for a pure translation by (tx, ty).
    shift = np.array([[1, 0, tx], [0, 1, ty]])
    image = cv2.warpAffine(image, shift, (w, h))

    bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx
    bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty
    return image, bboxes
#图像与边框以最小比例缩放 并以128填充到目标尺寸 图片像素为HWC格式
def image_preporcess(image, target_size, gt_boxes=None):
    """Letterbox an image to ``target_size`` and optionally rescale its boxes.

    The image is converted from BGR to RGB, resized by the largest factor that
    fits inside the target, centred on a canvas filled with 128, and divided
    by 255.

    Args:
        image: 3-D array indexed (row, column, channel), in BGR channel order.
        target_size: ``(height, width)`` of the output canvas.
        gt_boxes: optional array whose first four columns are
            ``xmin, ymin, xmax, ymax``; updated in place to canvas coordinates.

    Returns:
        The padded image, or ``(padded image, gt_boxes)`` when boxes are given.
    """
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)

    target_h, target_w = target_size
    src_h, src_w, _ = image.shape

    # One scale for both axes so the aspect ratio is preserved.
    scale = min(target_w / src_w, target_h / src_h)
    new_w, new_h = int(scale * src_w), int(scale * src_h)
    resized = cv2.resize(image, (new_w, new_h))

    # Centre the resized image on a canvas filled with 128.
    canvas = np.full(shape=[target_h, target_w, 3], fill_value=128.0)
    pad_x, pad_y = (target_w - new_w) // 2, (target_h - new_h) // 2
    canvas[pad_y:new_h + pad_y, pad_x:new_w + pad_x, :] = resized
    canvas = canvas / 255.

    if gt_boxes is None:
        return canvas

    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + pad_x
    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + pad_y
    return canvas, gt_boxes
#根据解析的路径信息得到缩放后的图像数据和边框数据,此时边框数据为(xmin,ymin,xmax,ymax,class)
def parse_annotation(annotation, data_aug, train_input_size):
    """Load the image and boxes described by one annotation line.

    Args:
        annotation: ``"<image path> x1,y1,x2,y2,cls x1,y1,x2,y2,cls ..."``.
        data_aug: when truthy, apply random flip, crop and translate in that order.
        train_input_size: side length of the square network input.

    Returns:
        ``(image, bboxes)`` after letterboxing to the network input size; the
        box rows are ``(xmin, ymin, xmax, ymax, class)``.

    Raises:
        KeyError: if the image path does not exist.
    """
    fields = annotation.split()
    image_path = fields[0]
    if not os.path.exists(image_path):
        raise KeyError("%s does not exist ... " %image_path)

    image = np.array(cv2.imread(image_path))
    # Every comma-separated value goes through float() first, then is truncated to int.
    bboxes = np.array([[int(float(value)) for value in box.split(',')]
                       for box in fields[1:]])

    if data_aug:
        # Each augmentation receives copies of the current image and boxes.
        for augment in (random_horizontal_flip, random_crop, random_translate):
            image, bboxes = augment(np.copy(image), np.copy(bboxes))

    image, bboxes = image_preporcess(np.copy(image),
                                     [train_input_size, train_input_size],
                                     np.copy(bboxes))
    return image, bboxes
3、对图片的边框数据进行分析,得到三种网格划分情况下的标签,以及真实边框位置信息
#计算两个框的iou
def bbox_iou(boxes1, boxes2):
    """Intersection over union of boxes given as ``(x_center, y_center, w, h)``.

    Both arguments are converted with ``np.array`` and combined with NumPy
    broadcasting over the leading axes.

    Returns:
        Array of IoU values, one per broadcast pair.
    """
    def to_corners(xywh):
        # (cx, cy, w, h) -> (xmin, ymin, xmax, ymax)
        return np.concatenate([xywh[..., :2] - xywh[..., 2:] * 0.5,
                               xywh[..., :2] + xywh[..., 2:] * 0.5], axis=-1)

    boxes1 = np.array(boxes1)
    boxes2 = np.array(boxes2)

    # Areas come straight from width * height.
    area1 = boxes1[..., 2] * boxes1[..., 3]
    area2 = boxes2[..., 2] * boxes2[..., 3]

    corners1 = to_corners(boxes1)
    corners2 = to_corners(boxes2)

    # Corners of the overlap rectangle; a negative extent means no overlap.
    overlap_min = np.maximum(corners1[..., :2], corners2[..., :2])
    overlap_max = np.minimum(corners1[..., 2:], corners2[..., 2:])
    overlap_wh = np.maximum(overlap_max - overlap_min, 0.0)

    inter_area = overlap_wh[..., 0] * overlap_wh[..., 1]
    union_area = area1 + area2 - inter_area
    return inter_area / union_area
def preprocess_true_boxes(bboxes, train_output_sizes):
    """Convert ground-truth boxes into per-scale training labels.

    For every box and every scale, the anchors are centred on the grid cell
    holding the box centre and compared with the box by IoU.  Anchors with
    IoU > 0.3 are marked responsible for the box; if no anchor at any scale
    qualifies, the single best anchor over all scales is used.

    Args:
        bboxes: iterable of rows ``(xmin, ymin, xmax, ymax, class index)``;
            the class index is used to index an array, so it must be an integer.
        train_output_sizes: grid side length for each of the three scales,
            in the same order as ``parameter.strides``.

    Returns:
        ``(label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes)``:
        the three label grids with last axis ``(x, y, w, h, conf, smoothed
        one-hot class)``, then the three ``(max_bbox_per_scale, 4)`` arrays of
        ``(x, y, w, h)`` boxes recorded for each scale.
    """
    num_classes = parameter.num_classes

    # One label grid per scale: (size, size, anchors per scale, 5 + num_classes).
    label = [np.zeros((train_output_sizes[i], train_output_sizes[i], parameter.anchor_per_scale,
                       5 + num_classes)) for i in range(3)]
    # Fixed-size buffers of the boxes recorded per scale; slots are reused
    # cyclically once more than max_bbox_per_scale boxes have been written.
    bboxes_xywh = [np.zeros((parameter.max_bbox_per_scale, 4)) for _ in range(3)]
    bbox_count = np.zeros((3,))

    def record(scale, anchor_sel, cell_xy, box_xywh, class_target):
        # Write one box into the label grid and the box buffer of `scale`.
        # `anchor_sel` is either a boolean mask over the anchors or one anchor index.
        xind, yind = cell_xy
        label[scale][yind, xind, anchor_sel, :] = 0
        label[scale][yind, xind, anchor_sel, 0:4] = box_xywh
        label[scale][yind, xind, anchor_sel, 4:5] = 1.0
        label[scale][yind, xind, anchor_sel, 5:] = class_target
        slot = int(bbox_count[scale] % parameter.max_bbox_per_scale)
        bboxes_xywh[scale][slot, :4] = box_xywh
        bbox_count[scale] += 1

    for bbox in bboxes:
        bbox_coor = bbox[:4]
        bbox_class_ind = bbox[4]

        # Label-smoothed one-hot class vector.  np.float64 replaces the np.float
        # alias, which was removed from NumPy and now raises AttributeError.
        onehot = np.zeros(num_classes, dtype=np.float64)
        onehot[bbox_class_ind] = 1.0
        uniform_distribution = np.full(num_classes, 1.0 / num_classes)
        deta = 0.01
        smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

        # Corners -> (center x, center y, width, height), in input-image pixels.
        bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                                    bbox_coor[2:] - bbox_coor[:2]], axis=-1)
        # The same box divided by each stride: one row per scale, shape (3, 4).
        bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / parameter.strides[:, np.newaxis]

        iou = []
        exist_positive = False
        for i in range(3):
            # Anchors at this scale, centred on the middle of the cell holding the box centre.
            anchors_xywh = np.zeros((parameter.anchor_per_scale, 4))
            anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
            anchors_xywh[:, 2:4] = parameter.anchors[i]

            # IoU of the box with each anchor of this scale.
            iou_scale = bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
            iou.append(iou_scale)
            iou_mask = iou_scale > 0.3

            if np.any(iou_mask):
                cell_xy = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)
                # Only the anchors selected by the mask are written.
                record(i, iou_mask, cell_xy, bbox_xywh, smooth_onehot)
                exist_positive = True

        # No anchor passed the threshold: fall back to the best anchor over all scales.
        if not exist_positive:
            best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
            best_detect = int(best_anchor_ind / parameter.anchor_per_scale)   # which scale
            best_anchor = int(best_anchor_ind % parameter.anchor_per_scale)   # which anchor in it
            cell_xy = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)
            record(best_detect, best_anchor, cell_xy, bbox_xywh, smooth_onehot)

    label_sbbox, label_mbbox, label_lbbox = label
    sbboxes, mbboxes, lbboxes = bboxes_xywh
    return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
4、将输入数据与输出标签做成批次,以便进行训练
def __next__(annotations, batch_size):
    """Assemble one training batch from the first ``batch_size`` annotations.

    A network input size is chosen at random from ``parameter.train_input_size``.
    Every selected annotation is parsed with augmentation enabled and converted
    to labels for the three scales.

    Args:
        annotations: list of annotation lines; shuffled in place if it is empty.
        batch_size: number of samples in the batch.

    Returns:
        ``(batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox,
        batch_sbboxes, batch_mbboxes, batch_lbboxes)``.

    Raises:
        StopIteration: when ``annotations`` is empty (zero batches available).
    """
    with tf.device('/cpu:0'):
        train_input_size = random.choice(parameter.train_input_size)
        train_output_sizes = train_input_size // parameter.strides

        # Input images.
        batch_image = np.zeros((batch_size, train_input_size, train_input_size, 3))

        # One label tensor per scale.
        batch_label_sbbox, batch_label_mbbox, batch_label_lbbox = [
            np.zeros((batch_size, train_output_sizes[k], train_output_sizes[k],
                      parameter.anchor_per_scale, 5 + parameter.num_classes))
            for k in range(3)
        ]

        # Ground-truth box buffers per scale, max_bbox_per_scale rows each.
        batch_sbboxes, batch_mbboxes, batch_lbboxes = [
            np.zeros((batch_size, parameter.max_bbox_per_scale, 4)) for _ in range(3)
        ]

        # The batch counter is local and always starts at zero, so each call
        # reads from the start of `annotations`.
        batch_count = 0
        num_samples = len(annotations)
        num_batchs = int(np.ceil(num_samples / batch_size))

        if batch_count >= num_batchs:
            np.random.shuffle(annotations)
            raise StopIteration

        for num in range(batch_size):
            index = batch_count * batch_size + num
            # Wrap around once when the batch runs past the end of the list.
            if index >= num_samples:
                index -= num_samples
            annotation = annotations[index]

            # Image and boxes for this annotation, then the labels for each scale.
            image, bboxes = parse_annotation(annotation, True, train_input_size)
            (label_sbbox, label_mbbox, label_lbbox,
             sbboxes, mbboxes, lbboxes) = preprocess_true_boxes(bboxes, train_output_sizes)

            batch_image[num, :, :, :] = image
            batch_label_sbbox[num, :, :, :, :] = label_sbbox
            batch_label_mbbox[num, :, :, :, :] = label_mbbox
            batch_label_lbbox[num, :, :, :, :] = label_lbbox
            batch_sbboxes[num, :, :] = sbboxes
            batch_mbboxes[num, :, :] = mbboxes
            batch_lbboxes[num, :, :] = lbboxes

        return (batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox,
                batch_sbboxes, batch_mbboxes, batch_lbboxes)
三、网络训练与损失函数
损失函数包括边界框GIOU损失函数、置信度focalloss损失函数,分类交叉熵损失函数,详情可以见参考链接[3]
#类别不平衡问题 https://blog.csdn.net/qq_34914551/article/details/89049001
def focal(self, target, actual, alpha=1, gamma=2):
    """Return the focal modulating factor ``alpha * |target - actual| ** gamma``."""
    return alpha * tf.pow(tf.abs(target - actual), gamma)
#GIOU说明 https://zhuanlan.zhihu.com/p/80600110
def bbox_giou(self, boxes1, boxes2):
    """Generalized IoU between boxes given as ``(x_center, y_center, w, h)``.

    Args:
        boxes1, boxes2: tensors whose last axis holds the four box values;
            they are combined element-wise.

    Returns:
        Tensor of GIoU values, with the last axis removed.
    """
    def to_corners(xywh):
        # (cx, cy, w, h) -> (xmin, ymin, xmax, ymax); the min/max pass puts the
        # corners in order even when a width or height is negative.
        raw = tf.concat([xywh[..., :2] - xywh[..., 2:] * 0.5,
                         xywh[..., :2] + xywh[..., 2:] * 0.5], axis=-1)
        return tf.concat([tf.minimum(raw[..., :2], raw[..., 2:]),
                          tf.maximum(raw[..., :2], raw[..., 2:])], axis=-1)

    def area(corners):
        return (corners[..., 2] - corners[..., 0]) * (corners[..., 3] - corners[..., 1])

    boxes1 = to_corners(boxes1)
    boxes2 = to_corners(boxes2)
    boxes1_area = area(boxes1)
    boxes2_area = area(boxes2)

    # Plain IoU.
    overlap_min = tf.maximum(boxes1[..., :2], boxes2[..., :2])
    overlap_max = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
    overlap_wh = tf.maximum(overlap_max - overlap_min, 0.0)
    inter_area = overlap_wh[..., 0] * overlap_wh[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area
    iou = inter_area / union_area

    # Smallest box enclosing both inputs.
    enclose_min = tf.minimum(boxes1[..., :2], boxes2[..., :2])
    enclose_max = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
    enclose_wh = tf.maximum(enclose_max - enclose_min, 0.0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]

    # Penalise the part of the enclosing box not covered by the union.
    return iou - 1.0 * (enclose_area - union_area) / enclose_area
#conv 网络输出 pred 直接位置预测 label 标签 bboxes
def loss_layer(self, conv, pred, label, bboxes, anchors, stride):
    """Compute the GIoU, confidence and class losses for one detection scale.

    Args:
        conv: raw head output for this scale.
        pred: decoded predictions for the same head, last axis
            ``(x, y, w, h, conf, ...)``.
        label: label grid, last axis ``(x, y, w, h, conf, class probabilities)``.
        bboxes: ground-truth ``(x, y, w, h)`` boxes per image.
        anchors: not used inside this function.
        stride: factor between grid units and input pixels for this scale.

    Returns:
        ``(giou_loss, conf_loss, prob_loss)``: each is summed over axes 1-4
        and averaged over the batch.
    """
    dyn_shape = tf.shape(conv)
    batch_size, output_size = dyn_shape[0], dyn_shape[1]
    input_size = tf.cast(stride * output_size, tf.float32)

    conv = tf.reshape(conv, (batch_size, output_size, output_size,
                             self.anchor_per_scale, 5 + self.num_class))
    conv_raw_conf = conv[:, :, :, :, 4:5]
    conv_raw_prob = conv[:, :, :, :, 5:]

    pred_xywh = pred[:, :, :, :, 0:4]
    pred_conf = pred[:, :, :, :, 4:5]

    label_xywh = label[:, :, :, :, 0:4]
    respond_bbox = label[:, :, :, :, 4:5]   # 1 where an anchor is responsible for an object
    label_prob = label[:, :, :, :, 5:]

    # Box regression loss: 1 - GIoU, weighted by (2 - relative box area).
    giou = tf.expand_dims(self.bbox_giou(pred_xywh, label_xywh), axis=-1)
    label_w = label_xywh[:, :, :, :, 2:3]
    label_h = label_xywh[:, :, :, :, 3:4]
    bbox_loss_scale = 2.0 - 1.0 * label_w * label_h / (input_size ** 2)
    giou_loss = respond_bbox * bbox_loss_scale * (1 - giou)

    # IoU of every predicted box with every ground-truth box, then the best
    # match over the ground-truth axis.
    iou = self.bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :],
                        bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
    max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)

    # Background: not responsible for any object and best IoU below the threshold.
    respond_bgd = (1.0 - respond_bbox) * tf.cast(max_iou < self.iou_loss_thresh, tf.float32)

    # Confidence loss: focal-weighted cross-entropy over object and background positions.
    conf_focal = self.focal(respond_bbox, pred_conf)
    conf_xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox,
                                                        logits=conv_raw_conf)
    conf_loss = conf_focal * (respond_bbox * conf_xent + respond_bgd * conf_xent)

    # Class loss, only where an object is present.
    prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob,
                                                                       logits=conv_raw_prob)

    def batch_mean(per_position):
        # Sum over grid, anchors and channels, then average over the batch.
        return tf.reduce_mean(tf.reduce_sum(per_position, axis=[1, 2, 3, 4]))

    giou_loss = batch_mean(giou_loss)
    conf_loss = batch_mean(conf_loss)
    prob_loss = batch_mean(prob_loss)
    return giou_loss, conf_loss, prob_loss
四、一个小应用,利用https://github.com/YunYang1994/tensorflow-yolov3的权值文件,重新构建yolov3网络前向过程,将江南style视频变成江南style-yolov3模式,完整代码如下,视频结果可见https://www.bilibili.com/video/BV1754y1S7yF/
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
import cv2
import colorsys
import random
from PIL import Image
from ffmpy import FFmpeg
# Session configuration: allow GPU memory to grow on demand, capped at 40%
# of the device memory for this process.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(allow_growth=True, per_process_gpu_memory_fraction=0.4))
sess = tf.Session(config=config)
#读取目标类别
def read_class_names(class_file_name):
    """Read a class-name file into a dict mapping line index to name.

    Each line of the file is one class name; only newline characters are
    stripped from its ends.
    """
    with open(class_file_name, 'r') as data:
        return {ID: name.strip('\n') for ID, name in enumerate(data)}
#输入图像处理 统一尺寸
def image_preporcess(image, target_size, gt_boxes=None):
    """Letterbox an image to ``target_size`` and optionally rescale its boxes.

    The image is converted from BGR to RGB, resized by the largest factor that
    fits inside the target, centred on a canvas filled with 128, and divided
    by 255.

    Args:
        image: 3-D array indexed (row, column, channel), in BGR channel order.
        target_size: ``(height, width)`` of the output canvas.
        gt_boxes: optional array whose first four columns are
            ``xmin, ymin, xmax, ymax``; updated in place to canvas coordinates.

    Returns:
        The padded image, or ``(padded image, gt_boxes)`` when boxes are given.
    """
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)

    target_h, target_w = target_size
    src_h, src_w, _ = image.shape

    # One scale for both axes so the aspect ratio is preserved.
    scale = min(target_w / src_w, target_h / src_h)
    new_w, new_h = int(scale * src_w), int(scale * src_h)
    resized = cv2.resize(image, (new_w, new_h))

    # Centre the resized image on a canvas filled with 128.
    canvas = np.full(shape=[target_h, target_w, 3], fill_value=128.0)
    pad_x, pad_y = (target_w - new_w) // 2, (target_h - new_h) // 2
    canvas[pad_y:new_h + pad_y, pad_x:new_w + pad_x, :] = resized
    canvas = canvas / 255.

    if gt_boxes is None:
        return canvas

    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + pad_x
    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + pad_y
    return canvas, gt_boxes
# YOLOv3 network: takes image data as input and exposes the decoded
# predictions pred_sbbox, pred_mbbox and pred_lbbox.
class YOLOV3(object):# inheriting from `object` makes this a new-style class
    """YOLOv3 detection graph: a residual backbone followed by three detection heads.

    Constructing an instance builds the whole graph for `input_data` and stores
    the raw head outputs (`conv_sbbox`, `conv_mbbox`, `conv_lbbox`) together
    with their decoded counterparts (`pred_sbbox`, `pred_mbbox`, `pred_lbbox`).

    NOTE(review): the nesting of the `tf.variable_scope` blocks below was
    reconstructed; variable names must presumably match the checkpoint being
    restored, so confirm the scope boundaries against it.
    """
    def __init__(self,input_data,trainable):# self is the instance reference
        """Build the network on top of `input_data`.

        Args:
            input_data: image tensor fed to the first convolution (NHWC with
                3 channels, since `conv0` uses a 3x3x3x32 filter).
            trainable: forwarded as `training=` to every batch-normalization
                layer.
        """
        self.trainable = trainable
        # Class names are read from `coco.names` in the working directory.
        self.classes = read_class_names('coco.names')
        self.num_class = len(self.classes)
        # One stride per detection scale (small, medium, large objects).
        self.strides = np.array([8, 16, 32])
        # Reshaped to (3, 3, 2): 3 scales x 3 anchors x 2 values. `decode`
        # multiplies them by the stride, so they are expressed in grid cells.
        self.anchors = np.array([1.25,1.625, 2.0,3.75, 4.125,2.875,
                                 1.875,3.8125, 3.875,2.8125, 3.6875,7.4375,
                                 3.625,2.8125, 4.875,6.1875, 11.65625,10.1875]).reshape(3,3,2)
        self.anchor_per_scale = 3
        self.iou_loss_thresh = 0.5
        self.upsample_method = 'resize'
        self.conv_lbbox, self.conv_mbbox, self.conv_sbbox = self.__build_nework(input_data)
        with tf.variable_scope('pred_sbbox'):
            self.pred_sbbox = self.decode(self.conv_sbbox, self.anchors[0], self.strides[0])
        with tf.variable_scope('pred_mbbox'):
            self.pred_mbbox = self.decode(self.conv_mbbox, self.anchors[1], self.strides[1])
        with tf.variable_scope('pred_lbbox'):
            self.pred_lbbox = self.decode(self.conv_lbbox, self.anchors[2], self.strides[2])
    # DBL block: convolution + batch normalization + leaky ReLU
    def convolutional(self, input_data, filters_shape, trainable, name, downsample=False, activate=True, bn=True):
        """Apply one convolution layer inside variable scope `name`.

        Args:
            input_data: input feature map.
            filters_shape: ``(kernel_h, kernel_w, in_channels, out_channels)``.
            trainable: passed as `training=` to batch normalization.
            name: variable scope holding this layer's variables.
            downsample: if True, pad explicitly and convolve with stride 2.
            activate: if True, finish with leaky ReLU (alpha 0.1).
            bn: if True use batch normalization, otherwise add a bias.

        Returns:
            The output feature map.
        """
        with tf.variable_scope(name):
            if downsample:
                # Explicit padding followed by a stride-2 'VALID' convolution.
                pad_h, pad_w = (filters_shape[0] - 2) // 2 + 1, (filters_shape[1] - 2) // 2 + 1
                paddings = tf.constant([[0, 0], [pad_h, pad_h], [pad_w, pad_w], [0, 0]])
                input_data = tf.pad(input_data, paddings, 'CONSTANT')
                strides = (1, 2, 2, 1)
                padding = 'VALID'
            else:
                strides = (1, 1, 1, 1)
                padding = "SAME"
            # NOTE(review): the variable is always created with trainable=True;
            # the `trainable` argument only reaches batch normalization.
            weight = tf.get_variable(name='weight', dtype=tf.float32, trainable=True,
                                     shape=filters_shape, initializer=tf.random_normal_initializer(stddev=0.01))
            conv = tf.nn.conv2d(input=input_data, filter=weight, strides=strides, padding=padding)
            if bn:
                conv = tf.layers.batch_normalization(conv, beta_initializer=tf.zeros_initializer(),
                                                     gamma_initializer=tf.ones_initializer(),
                                                     moving_mean_initializer=tf.zeros_initializer(),
                                                     moving_variance_initializer=tf.ones_initializer(), training=trainable)
            else:
                bias = tf.get_variable(name='bias', shape=filters_shape[-1], trainable=True,
                                       dtype=tf.float32, initializer=tf.constant_initializer(0.0))
                conv = tf.nn.bias_add(conv, bias)
            if activate == True: conv = tf.nn.leaky_relu(conv, alpha=0.1)
        return conv
    # Residual block
    def residual_block(self, input_data, input_channel, filter_num1, filter_num2, trainable, name):
        """Add the input to the output of a 1x1 convolution followed by a 3x3 convolution.

        `filter_num2` has to equal the channel count of `input_data` for the
        element-wise addition to be valid.
        """
        short_cut = input_data
        with tf.variable_scope(name):
            input_data = self.convolutional(input_data, filters_shape=(1, 1, input_channel, filter_num1),
                                            trainable=trainable, name='conv1')
            input_data = self.convolutional(input_data, filters_shape=(3, 3, filter_num1, filter_num2),
                                            trainable=trainable, name='conv2')
            residual_output = input_data + short_cut
        return residual_output
    # Concat block
    def route(self, name, previous_output, current_output):
        """Concatenate `current_output` and `previous_output` along the last axis.

        Not called by `__build_nework` in this class, which uses `tf.concat`
        directly.
        """
        with tf.variable_scope(name):
            output = tf.concat([current_output, previous_output], axis=-1)
        return output
    # Upsampling block
    def upsample(self, input_data, name, method="deconv"):
        """Double the spatial size of `input_data`.

        Args:
            input_data: feature map to enlarge.
            name: variable scope used by the "resize" branch; the "deconv"
                branch does not use it.
            method: "resize" for nearest-neighbour resizing, "deconv" for a
                stride-2 transposed convolution.

        Returns:
            The upsampled feature map.
        """
        assert method in ["resize", "deconv"]
        if method == "resize":
            with tf.variable_scope(name):
                input_shape = tf.shape(input_data)
                output = tf.image.resize_nearest_neighbor(input_data, (input_shape[1] * 2, input_shape[2] * 2))
        if method == "deconv":
            # replace resize_nearest_neighbor with conv2d_transpose To support TensorRT optimization
            numm_filter = input_data.shape.as_list()[-1]
            output = tf.layers.conv2d_transpose(input_data, numm_filter, kernel_size=2, padding='same',
                                                strides=(2,2), kernel_initializer=tf.random_normal_initializer())
        return output
    def __build_nework(self, input_data):
        """Build the backbone and the three detection heads.

        Returns:
            ``(conv_lbbox, conv_mbbox, conv_sbbox)``: raw head outputs, each
            with ``3 * (num_class + 5)`` channels. `conv_lbbox` is taken after
            five stride-2 convolutions, `conv_mbbox` after one upsampling step
            and `conv_sbbox` after two.
        """
        # Backbone: stride-2 convolutions interleaved with residual stages.
        # NOTE(review): the 'darknet' scope is assumed to end after the last
        # residual stage, leaving conv52 and later outside it -- confirm
        # against the checkpoint's variable names.
        with tf.variable_scope('darknet'):
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 3, 32),
                                            trainable=self.trainable, name='conv0')
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 32, 64),
                                            trainable=self.trainable, name='conv1', downsample=True)
            for i in range(1):
                input_data = self.residual_block(input_data, 64, 32, 64,
                                                 trainable=self.trainable, name='residual%d' %(i+0))
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 64, 128),
                                            trainable=self.trainable, name='conv4', downsample=True)
            for i in range(2):
                input_data = self.residual_block(input_data, 128, 64, 128,
                                                 trainable=self.trainable, name='residual%d' %(i+1))
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 128, 256),
                                            trainable=self.trainable, name='conv9', downsample=True)
            for i in range(8):
                input_data = self.residual_block(input_data, 256, 128, 256,
                                                 trainable=self.trainable, name='residual%d' %(i+3))
            # 256-channel feature map reused by the small-object head.
            route_1 = input_data
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 256, 512),
                                            trainable=self.trainable, name='conv26', downsample=True)
            for i in range(8):
                input_data = self.residual_block(input_data, 512, 256, 512,
                                                 trainable=self.trainable, name='residual%d' %(i+11))
            # 512-channel feature map reused by the medium-object head.
            route_2 = input_data
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 512, 1024),
                                            trainable=self.trainable, name='conv43', downsample=True)
            for i in range(4):
                input_data = self.residual_block(input_data, 1024, 512, 1024,
                                                 trainable=self.trainable, name='residual%d' %(i+19))
        # Large-object head on the deepest feature map.
        input_data = self.convolutional(input_data, (1, 1, 1024, 512), trainable=self.trainable, name = 'conv52')
        input_data = self.convolutional(input_data, (3, 3, 512, 1024), trainable=self.trainable, name = 'conv53')
        input_data = self.convolutional(input_data, (1, 1, 1024, 512), trainable=self.trainable, name = 'conv54')
        input_data = self.convolutional(input_data, (3, 3, 512, 1024), trainable=self.trainable, name = 'conv55')
        input_data = self.convolutional(input_data, (1, 1, 1024, 512), trainable=self.trainable, name = 'conv56')
        conv_lobj_branch = self.convolutional(input_data, (3, 3, 512, 1024),
                                              trainable=self.trainable, name='conv_lobj_branch')
        # Final 1x1 prediction layer: no batch norm and no activation.
        conv_lbbox = self.convolutional(conv_lobj_branch, (1, 1, 1024, 3*(self.num_class + 5)),
                                        trainable=self.trainable, name='conv_lbbox', activate=False, bn=False)
        # Medium-object head: upsample and concatenate with route_2 (256 + 512 = 768 channels).
        input_data = self.convolutional(input_data, (1, 1, 512, 256),
                                        trainable=self.trainable, name = 'conv57')
        input_data = self.upsample(input_data, name='upsample0', method=self.upsample_method)
        with tf.variable_scope('route_1'):
            input_data = tf.concat([input_data, route_2], axis=-1)
        input_data = self.convolutional(input_data, (1, 1, 768, 256), trainable=self.trainable, name = 'conv58')
        input_data = self.convolutional(input_data, (3, 3, 256, 512), trainable=self.trainable, name = 'conv59')
        input_data = self.convolutional(input_data, (1, 1, 512, 256), trainable=self.trainable, name = 'conv60')
        input_data = self.convolutional(input_data, (3, 3, 256, 512), trainable=self.trainable, name = 'conv61')
        input_data = self.convolutional(input_data, (1, 1, 512, 256), trainable=self.trainable, name = 'conv62')
        conv_mobj_branch = self.convolutional(input_data, (3, 3, 256, 512),
                                              trainable=self.trainable, name='conv_mobj_branch' )
        conv_mbbox = self.convolutional(conv_mobj_branch, (1, 1, 512, 3*(self.num_class + 5)),
                                        trainable=self.trainable, name='conv_mbbox', activate=False, bn=False)
        # Small-object head: upsample and concatenate with route_1 (128 + 256 = 384 channels).
        input_data = self.convolutional(input_data, (1, 1, 256, 128), trainable=self.trainable, name = 'conv63')
        input_data = self.upsample(input_data, name='upsample1', method=self.upsample_method)
        with tf.variable_scope('route_2'):
            input_data = tf.concat([input_data, route_1], axis=-1)
        input_data = self.convolutional(input_data, (1, 1, 384, 128), trainable=self.trainable, name = 'conv64')
        input_data = self.convolutional(input_data, (3, 3, 128, 256), trainable=self.trainable, name = 'conv65')
        input_data = self.convolutional(input_data, (1, 1, 256, 128), trainable=self.trainable, name = 'conv66')
        input_data = self.convolutional(input_data, (3, 3, 128, 256), trainable=self.trainable, name = 'conv67')
        input_data = self.convolutional(input_data, (1, 1, 256, 128), trainable=self.trainable, name = 'conv68')
        conv_sobj_branch = self.convolutional(input_data, (3, 3, 128, 256), trainable=self.trainable, name='conv_sobj_branch')
        conv_sbbox = self.convolutional(conv_sobj_branch, (1, 1, 256, 3*(self.num_class + 5)),
                                        trainable=self.trainable, name='conv_sbbox', activate=False, bn=False)
        return conv_lbbox,conv_mbbox,conv_sbbox
    def decode(self, conv_output, anchors, stride):
        """Turn a raw head output into box centres/sizes, confidence and class probabilities.

        Args:
            conv_output: raw head output; its second dimension is used as the
                grid size for both spatial axes, so a square grid is assumed.
            anchors: the anchors of this scale; `len(anchors)` gives the number
                of boxes per grid cell.
            stride: factor applied to the grid-relative centres and sizes.

        Returns:
            Tensor of shape (batch, grid, grid, anchors, 5 + num_class) holding
            (x, y, w, h, confidence, class probabilities).
        """
        conv_shape = tf.shape(conv_output)
        batch_size = conv_shape[0]
        output_size = conv_shape[1]
        anchor_per_scale = len(anchors)
        # Raw head output is (batch, grid, grid, anchors * (5 + num_class)),
        # e.g. 3 * (80 + 5) = 255 channels for 80 classes.
        conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, anchor_per_scale, 5 + self.num_class))
        # After the reshape: (batch, grid, grid, anchor index, 5 + num_class),
        # the last axis holding centre offsets, size scaling, confidence and
        # class scores.
        conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
        conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
        conv_raw_conf = conv_output[:, :, :, :, 4:5]
        conv_raw_prob = conv_output[:, :, :, :, 5: ]
        # Grid of cell indices: xy_grid[row, col] == (col, row).
        y = tf.tile(tf.range(output_size, dtype=tf.int32)[:, tf.newaxis], [1, output_size])
        x = tf.tile(tf.range(output_size, dtype=tf.int32)[tf.newaxis, :], [output_size, 1])
        xy_grid = tf.concat([x[:, :, tf.newaxis], y[:, :, tf.newaxis]], axis=-1)
        xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, anchor_per_scale, 1])
        xy_grid = tf.cast(xy_grid, tf.float32)
        # Centre: sigmoid offset inside the cell plus the cell index, times stride.
        pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * stride
        # Size: exponential scaling of the anchor, times stride.
        pred_wh = (tf.exp(conv_raw_dwdh) * anchors) * stride
        pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)
        pred_conf = tf.sigmoid(conv_raw_conf)
        pred_prob = tf.sigmoid(conv_raw_prob)
        # Output: (batch, grid, grid, anchor index, 5 + num_class) with
        # (x, y, w, h, confidence, class probabilities) on the last axis.
        return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)
# Post-process the network output: map boxes back onto the original image and
# drop the ones that are invalid or score too low.
def postprocess_boxes(pred_bbox, org_img_shape, input_size, score_threshold):
    """Convert raw predictions into filtered boxes in original-image coordinates.

    Args:
        pred_bbox: array of shape (N, 5 + num_classes); each row is
            (x_center, y_center, width, height, confidence, class probs...).
        org_img_shape: ``(height, width)`` of the original image.
        input_size: side length of the square network input.
        score_threshold: boxes whose ``confidence * best class probability``
            is not above this value are dropped.

    Returns:
        Array of shape (M, 6) with rows
        (xmin, ymin, xmax, ymax, score, class_index).
    """
    scale_lo, scale_hi = 0, np.inf

    pred_bbox = np.array(pred_bbox)
    centers, sizes = pred_bbox[:, 0:2], pred_bbox[:, 2:4]
    objectness = pred_bbox[:, 4]
    class_probs = pred_bbox[:, 5:]

    # (1) (x, y, w, h) --> (xmin, ymin, xmax, ymax)
    half_sizes = sizes * 0.5
    corners = np.concatenate([centers - half_sizes, centers + half_sizes], axis=-1)

    # (2) undo the letterbox resize and padding to get original-image coordinates
    org_h, org_w = org_img_shape
    ratio = min(input_size / org_w, input_size / org_h)
    pad_x = (input_size - ratio * org_w) / 2
    pad_y = (input_size - ratio * org_h) / 2
    corners[:, 0::2] = 1.0 * (corners[:, 0::2] - pad_x) / ratio
    corners[:, 1::2] = 1.0 * (corners[:, 1::2] - pad_y) / ratio

    # (3) clip to the image and zero out boxes that end up inverted
    top_left = np.maximum(corners[:, :2], [0, 0])
    bottom_right = np.minimum(corners[:, 2:], [org_w - 1, org_h - 1])
    corners = np.concatenate([top_left, bottom_right], axis=-1)
    inverted = (corners[:, 0] > corners[:, 2]) | (corners[:, 1] > corners[:, 3])
    corners[inverted] = 0

    # (4) keep boxes whose sqrt(area) lies strictly inside the valid scale range
    box_scale = np.sqrt(np.prod(corners[:, 2:4] - corners[:, 0:2], axis=-1))
    keep_scale = (scale_lo < box_scale) & (box_scale < scale_hi)

    # (5) keep boxes whose best class score exceeds the threshold
    best_class = np.argmax(class_probs, axis=-1)
    score = objectness * class_probs[np.arange(len(corners)), best_class]
    keep = keep_scale & (score > score_threshold)

    return np.concatenate(
        [corners[keep], score[keep][:, np.newaxis], best_class[keep][:, np.newaxis]],
        axis=-1)
# IoU computation used by NMS.
def bboxes_iou(boxes1, boxes2):
    """Return the intersection-over-union of two broadcastable sets of boxes.

    Args:
        boxes1: array-like whose last axis is (xmin, ymin, xmax, ymax).
        boxes2: array-like with the same last-axis layout; leading axes are
            broadcast against those of `boxes1`.

    Returns:
        Array of IoU values with the broadcast leading shape. Every value is
        at least ``np.finfo(np.float32).eps``. Pairs whose union area is not
        positive (e.g. two zero-area boxes) yield that floor value instead of
        NaN.
    """
    boxes1 = np.array(boxes1)
    boxes2 = np.array(boxes2)

    boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])

    left_up = np.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down = np.minimum(boxes1[..., 2:], boxes2[..., 2:])

    # Negative extents mean the boxes do not overlap on that axis.
    inter_section = np.maximum(right_down - left_up, 0.0)
    inter_area = 1.0 * inter_section[..., 0] * inter_section[..., 1]
    union_area = boxes1_area + boxes2_area - inter_area

    # Divide only where the union is positive; a plain division would produce
    # NaN (and a RuntimeWarning) for degenerate boxes.
    ious = np.zeros_like(union_area)
    np.divide(inter_area, union_area, out=ious, where=union_area > 0)
    return np.maximum(ious, np.finfo(np.float32).eps)
# Apply non-maximum suppression to the output boxes.
def nms(bboxes, iou_threshold, sigma=0.3, method='nms'):
    """Per-class non-maximum suppression.

    :param bboxes: array with rows (xmin, ymin, xmax, ymax, score, class)
    :param iou_threshold: for method 'nms', boxes overlapping the current best
        box by more than this IoU are discarded
    :param sigma: decay parameter of the Gaussian weighting used by 'soft-nms'
    :param method: 'nms' or 'soft-nms'
    :return: list of the selected box rows

    Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf
          https://github.com/bharatsingh430/soft-nms
    """
    selected = []
    for cls in set(bboxes[:, 5]):
        # Boolean indexing copies, so the caller's array is never modified.
        remaining = bboxes[bboxes[:, 5] == cls]
        while len(remaining) > 0:
            # Pick the highest-scoring box and take it out of the pool.
            top = np.argmax(remaining[:, 4])
            winner = remaining[top]
            selected.append(winner)
            remaining = np.concatenate([remaining[:top], remaining[top + 1:]])

            overlap = bboxes_iou(winner[np.newaxis, :4], remaining[:, :4])
            decay = np.ones((len(overlap),), dtype=np.float32)
            assert method in ['nms', 'soft-nms']
            if method == 'soft-nms':
                # Gaussian score decay instead of hard removal.
                decay = np.exp(-(1.0 * overlap ** 2 / sigma))
            elif method == 'nms':
                decay[overlap > iou_threshold] = 0.0

            # Rescale the scores and drop boxes whose score fell to zero.
            remaining[:, 4] = remaining[:, 4] * decay
            remaining = remaining[remaining[:, 4] > 0.]
    return selected
# Draw the boxes.
def draw_bbox(image, bboxes, classes=None, show_label=True):
    """Draw detection boxes, and optionally their labels, onto `image` in place.

    Args:
        image: array of shape (h, w, channels); it is modified and returned.
        bboxes: iterable of [x_min, y_min, x_max, y_max, probability, cls_id].
        classes: mapping from class id to class name. When None, the names are
            loaded from 'coco.names' at call time (not at definition time, so
            defining this function does not require the file to exist).
        show_label: if True, draw a filled label with the class name and score
            above each box.

    Returns:
        The same `image` object with the boxes drawn on it.
    """
    if classes is None:
        classes = read_class_names('coco.names')

    num_classes = len(classes)
    image_h, image_w, _ = image.shape

    # One distinct hue per class, shuffled in a fixed order so that colours are
    # stable between calls. A private generator seeded with 0 gives the same
    # order as seeding the global one, without disturbing global random state.
    hsv_tuples = [(1.0 * x / num_classes, 1., 1.) for x in range(num_classes)]
    colors = [colorsys.hsv_to_rgb(*hsv) for hsv in hsv_tuples]
    colors = [(int(r * 255), int(g * 255), int(b * 255)) for r, g, b in colors]
    random.Random(0).shuffle(colors)

    # These depend only on the image, not on the individual box.
    fontScale = 0.5
    bbox_thick = int(0.6 * (image_h + image_w) / 600)

    for bbox in bboxes:
        coor = np.array(bbox[:4], dtype=np.int32)
        score = bbox[4]
        class_ind = int(bbox[5])
        bbox_color = colors[class_ind]
        # Plain Python ints: some OpenCV builds reject numpy scalars in points.
        c1 = (int(coor[0]), int(coor[1]))
        c2 = (int(coor[2]), int(coor[3]))
        cv2.rectangle(image, c1, c2, bbox_color, bbox_thick)

        if show_label:
            bbox_mess = '%s: %.2f' % (classes[class_ind], score)
            t_size = cv2.getTextSize(bbox_mess, 0, fontScale, thickness=bbox_thick//2)[0]
            cv2.rectangle(image, c1, (c1[0] + t_size[0], c1[1] - t_size[1] - 3), bbox_color, -1)  # filled
            cv2.putText(image, bbox_mess, (c1[0], c1[1]-2), cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale, (0, 0, 0), bbox_thick//2, lineType=cv2.LINE_AA)
    return image
# Separate audio and video.
def open_audio_video(mp4_file_path, video_path, audio_path):
    """Split `mp4_file_path` into two single-stream MP4 files by running ffmpeg.

    Stream ``0:0`` of the input is written to `audio_path` and stream ``0:1``
    to `video_path`.

    NOTE(review): which of those streams is audio and which is video depends
    on the input file; the __main__ block reads video frames from the file
    written to `audio_path`, so the two names look swapped -- confirm.
    """
    shared_opts = ['-c:a', 'copy', '-f', 'mp4']
    targets = {
        audio_path: ['-map', '0:0'] + shared_opts,
        video_path: ['-map', '0:1'] + shared_opts,
    }
    splitter = FFmpeg(inputs={mp4_file_path: None}, outputs=targets)
    splitter.run()
# Merge audio and video.
def close_audio_video(out_mp4_file_path, video_path, audio_path):
    """Combine two input files into `out_mp4_file_path` by running ffmpeg.

    Both `audio_path` and `video_path` are passed as inputs (in that order) and
    the output is encoded with the options ``-c:v h264 -c:a aac``.
    """
    sources = {audio_path: None, video_path: None}
    encode_opts = '-c:v h264 -c:a aac'
    merger = FFmpeg(inputs=sources, outputs={out_mp4_file_path: encode_opts})
    merger.run()
if __name__ == '__main__':
    # True: run detection on a single image; False: run it on a video file.
    img_video = False
    # Side length of the square network input.
    input_size = 416
    with tf.Graph().as_default():
        # Build the computation graph.
        input_data = tf.placeholder(dtype=tf.float32, name='input_data')
        training = tf.placeholder(dtype=tf.bool, name='trainable')
        network = YOLOV3(input_data, training)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, './checkpoint/yolov3_coco_demo.ckpt')

            def detect(bgr_frame):
                """Run YOLOv3 on one BGR frame; return an RGB copy with boxes drawn."""
                rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
                # image_preporcess performs its own BGR->RGB conversion, so it
                # must receive the untouched BGR frame; handing it the RGB copy
                # would swap the channels twice and feed BGR to the network.
                image_data = image_preporcess(np.copy(bgr_frame), [input_size, input_size])
                # Add the batch dimension.
                image_data = image_data[np.newaxis, ...]
                pred_sbbox, pred_mbbox, pred_lbbox = sess.run(
                    [network.pred_sbbox, network.pred_mbbox, network.pred_lbbox],
                    feed_dict={input_data: image_data, training: False})
                # Merge the predictions of the three scales into one (N, 5 + classes) array.
                box_len = 5 + network.num_class
                pred_bbox = np.concatenate([np.reshape(pred_sbbox, (-1, box_len)),
                                            np.reshape(pred_mbbox, (-1, box_len)),
                                            np.reshape(pred_lbbox, (-1, box_len))], axis=0)
                # Post-process the boxes and apply NMS.
                bboxes = postprocess_boxes(pred_bbox, bgr_frame.shape[:2], input_size, 0.3)
                bboxes = nms(bboxes, 0.45, method='nms')
                return draw_bbox(rgb_frame, bboxes)

            if img_video:
                # Read the image; cv2.imread returns None when it cannot be read.
                original_image = cv2.imread('road.jpeg')
                if original_image is None:
                    raise ValueError("No image!")
                # Detect, draw on the image and show it.
                Image.fromarray(detect(original_image)).show()
            else:
                video_file_path = 'road.mp4'
                video_path = 'video.mp4'
                audio_path = 'audio.mp4'
                # Separate audio and video.
                open_audio_video(video_file_path, video_path, audio_path)
                # NOTE(review): frames are read from `audio_path`, i.e. the file
                # that received stream 0:0 -- the variable names look swapped.
                vid = cv2.VideoCapture(audio_path)
                fps = vid.get(cv2.CAP_PROP_FPS)  # frame rate of the video
                frame_w = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
                frame_h = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
                writer = cv2.VideoWriter('PSY.avi', cv2.VideoWriter_fourcc('M', 'P', '4', '2'),
                                         int(fps), (frame_w, frame_h))
                # Delay between frames; guards against fps == 0 and against a
                # zero delay, which would make waitKey block indefinitely.
                delay_ms = max(1, int(1000 / fps)) if fps > 0 else 1
                cv2.namedWindow("result", cv2.WINDOW_AUTOSIZE)
                frames_done = 0
                try:
                    while True:
                        # read() returns (ok, frame); ok is False once the
                        # stream is exhausted or a frame cannot be read.
                        return_value, frame = vid.read()
                        if not return_value:
                            break
                        result = cv2.cvtColor(detect(frame), cv2.COLOR_RGB2BGR)
                        cv2.imshow("result", result)
                        # Writes the picture only; audio is merged afterwards.
                        writer.write(result)
                        frames_done += 1
                        if cv2.waitKey(delay_ms) & 0xFF == ord('q'):
                            break
                finally:
                    # Always release the capture and finalize the output file.
                    vid.release()
                    writer.release()
                    cv2.destroyAllWindows()
                if frames_done == 0:
                    raise ValueError("No image!")
                # Merge audio and video (can also be run from the command line).
                close_audio_video('PSY_yolo_style.avi', video_path, 'PSY.avi')
参考链接:
[1]https://zhuanlan.zhihu.com/p/79425557
[2]https://zhuanlan.zhihu.com/p/80056633
[3]https://zhuanlan.zhihu.com/p/80208709
[4]https://zhuanlan.zhihu.com/p/80600110
[5]https://github.com/YunYang1994/tensorflow-yolov3
[6]https://blog.csdn.net/leviopku/article/details/82660381
[7]https://www.bilibili.com/video/BV1i64y1u7Zj?p=2