yolov3(tensorflow1)的代码阅读

li_jixing_0425

于 2020-07-23 17:10:26 发布

阅读量1.4k

点赞数 2

分类专栏： tensorflow1 文章标签： tensorflow 深度学习神经网络

本文链接：https://blog.csdn.net/lijixing_another/article/details/107541831

版权

tensorflow1 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

本文主要通过啃https://github.com/YunYang1994/tensorflow-yolov3代码来学习了下yolov3，此外从B站up主https://www.bilibili.com/video/BV1i64y1u7Zj?p=2学习了目标检测的基础知识，例如NMS,IOU,YOLO发展等。

一、网络结构，前向计算过程

上述结构图源自参考链接[6]，个人总结以下特点：Darknet-53主要为残差网络结构；DBL网络块中利用卷积代替池化进行2倍下采样；3尺度输出，每一种尺度对应不同大小物体的检测；网络的输出维度与图片的输入维度无关。详细的网络结构可以参照参考链接[5]，摘录代码如下：

基本网络块：DBL、RES、CONCAT、UPsample

def convolutional(input_data, filters_shape, trainable, name, downsample=False, activate=True, bn=True):
    """Convolution optionally followed by batch norm and leaky ReLU.

    :param input_data: input tensor, padded on axes 1 and 2 when downsampling
                       (i.e. treated as NHWC).
    :param filters_shape: kernel shape (kh, kw, in_channels, out_channels).
    :param trainable: forwarded to batch normalization as ``training``.
                      NOTE: the kernel and bias variables themselves are
                      always created with ``trainable=True``.
    :param name: variable scope that holds the layer's variables.
    :param downsample: when True, pad explicitly and convolve with stride 2
                       and 'VALID' padding instead of stride 1 / 'SAME'.
    :param activate: apply ``leaky_relu(alpha=0.1)`` when equal to True.
    :param bn: use batch normalization; otherwise add a zero-initialized bias.
    :return: the output tensor of the layer.
    """
    with tf.variable_scope(name):
        if not downsample:
            strides, padding = (1, 1, 1, 1), "SAME"
        else:
            # Explicit symmetric padding so the stride-2 'VALID' convolution
            # halves the spatial size.
            pad_h = (filters_shape[0] - 2) // 2 + 1
            pad_w = (filters_shape[1] - 2) // 2 + 1
            explicit_pad = tf.constant([[0, 0], [pad_h, pad_h], [pad_w, pad_w], [0, 0]])
            input_data = tf.pad(input_data, explicit_pad, 'CONSTANT')
            strides, padding = (1, 2, 2, 1), 'VALID'

        # Kernel weights, drawn from N(0, 0.01^2).
        kernel = tf.get_variable(name='weight', dtype=tf.float32, trainable=True,
                                 shape=filters_shape,
                                 initializer=tf.random_normal_initializer(stddev=0.01))
        output = tf.nn.conv2d(input=input_data, filter=kernel, strides=strides, padding=padding)

        if not bn:
            # Without batch norm the layer gets a plain bias term instead.
            offset = tf.get_variable(name='bias', shape=filters_shape[-1], trainable=True,
                                     dtype=tf.float32, initializer=tf.constant_initializer(0.0))
            output = tf.nn.bias_add(output, offset)
        else:
            output = tf.layers.batch_normalization(output, beta_initializer=tf.zeros_initializer(),
                                                   gamma_initializer=tf.ones_initializer(),
                                                   moving_mean_initializer=tf.zeros_initializer(),
                                                   moving_variance_initializer=tf.ones_initializer(),
                                                   training=trainable)

        if activate == True:
            output = tf.nn.leaky_relu(output, alpha=0.1)

    return output
# Residual block: the input is added to the result of a 1x1 convolution
# followed by a 3x3 convolution, so both must have the same shape.
def residual_block(input_data, input_channel, filter_num1, filter_num2, trainable, name):
    """Apply conv1 (1x1) and conv2 (3x3) and add the block input back.

    :param input_data: input tensor; also used as the shortcut branch.
    :param input_channel: channel count of ``input_data``.
    :param filter_num1: output channels of the 1x1 convolution.
    :param filter_num2: output channels of the 3x3 convolution.
    :param trainable: forwarded to both ``convolutional`` calls.
    :param name: variable scope of the block.
    :return: conv2 output plus the original input.
    """
    with tf.variable_scope(name):
        squeezed = convolutional(input_data, filters_shape=(1, 1, input_channel, filter_num1),
                                 trainable=trainable, name='conv1')
        expanded = convolutional(squeezed, filters_shape=(3, 3, filter_num1, filter_num2),
                                 trainable=trainable, name='conv2')
        residual_output = expanded + input_data

    return residual_output
# Stack two feature maps along the last (channel) axis.
def route(name, previous_output, current_output):
    """Concatenate ``current_output`` and ``previous_output`` on the last axis.

    :param name: variable scope in which the concat op is created.
    :param previous_output: tensor placed second in the concatenation.
    :param current_output: tensor placed first in the concatenation.
    :return: the concatenated tensor.
    """
    with tf.variable_scope(name):
        merged = tf.concat([current_output, previous_output], axis=-1)
    return merged

# 2x upsampling, either by nearest-neighbour resize or by transposed convolution.
def upsample(input_data, name, method="deconv"):
    """Double the size of axes 1 and 2 of ``input_data``.

    :param input_data: input feature map.
    :param name: variable scope; only used by the "resize" method.
    :param method: "resize" (nearest-neighbour resize) or "deconv"
                   (2x2 transposed convolution with stride 2 that keeps the
                   channel count).
    :return: the upsampled tensor.
    """
    assert method in ["resize", "deconv"]

    if method == "resize":
        with tf.variable_scope(name):
            dynamic_shape = tf.shape(input_data)
            doubled_size = (dynamic_shape[1] * 2, dynamic_shape[2] * 2)
            output = tf.image.resize_nearest_neighbor(input_data, doubled_size)
    elif method == "deconv":
        # Same number of output channels as the input (static shape).
        channels = input_data.shape.as_list()[-1]
        output = tf.layers.conv2d_transpose(input_data, channels, kernel_size=2, padding='same',
                                            strides=(2,2), kernel_initializer=tf.random_normal_initializer())
    return output

Darknet53结构

# Shape comments below assume a 416*416*3 input image.
def darknet53(input_data, trainable):
    """Build the Darknet-53 backbone inside the 'darknet' variable scope.

    :param input_data: input image tensor.
    :param trainable: forwarded to every convolutional / residual block.
    :return: (route_1, route_2, input_data) - the feature maps taken after the
             third, fourth and fifth group of residual blocks.
    """

    with tf.variable_scope('darknet'):
        # 3*3*3*32 convolution -> 416*416*32
        input_data = common.convolutional(input_data, filters_shape=(3, 3,  3,  32), trainable=trainable, name='conv0')
        # 3*3*32*64 stride-2 convolution -> 208*208*64
        input_data = common.convolutional(input_data, filters_shape=(3, 3, 32,  64),
                                          trainable=trainable, name='conv1', downsample=True)
        # One residual block: 1*1*64*32 conv then 3*3*32*64 conv, added to the input -> 208*208*64
        for i in range(1):
            input_data = common.residual_block(input_data,  64,  32, 64, trainable=trainable, name='residual%d' %(i+0))
        # 3*3*64*128 stride-2 convolution -> 104*104*128
        input_data = common.convolutional(input_data, filters_shape=(3, 3,  64, 128),
                                          trainable=trainable, name='conv4', downsample=True)
        # Two residual blocks: 1*1*128*64 conv then 3*3*64*128 conv, added to the input -> 104*104*128
        for i in range(2):
            input_data = common.residual_block(input_data, 128,  64, 128, trainable=trainable, name='residual%d' %(i+1))
        # 3*3*128*256 stride-2 convolution -> 52*52*256
        input_data = common.convolutional(input_data, filters_shape=(3, 3, 128, 256),
                                          trainable=trainable, name='conv9', downsample=True)
        # Eight residual blocks: 1*1*256*128 conv then 3*3*128*256 conv -> 52*52*256
        for i in range(8):
            input_data = common.residual_block(input_data, 256, 128, 256, trainable=trainable, name='residual%d' %(i+3))

        route_1 = input_data
        # 3*3*256*512 stride-2 convolution -> 26*26*512
        input_data = common.convolutional(input_data, filters_shape=(3, 3, 256, 512),
                                          trainable=trainable, name='conv26', downsample=True)
        # Eight residual blocks: 1*1*512*256 conv then 3*3*256*512 conv -> 26*26*512
        for i in range(8):
            input_data = common.residual_block(input_data, 512, 256, 512, trainable=trainable, name='residual%d' %(i+11))

        route_2 = input_data
        # 3*3*512*1024 stride-2 convolution -> 13*13*1024
        input_data = common.convolutional(input_data, filters_shape=(3, 3, 512, 1024),
                                          trainable=trainable, name='conv43', downsample=True)
        # Four residual blocks: 1*1*1024*512 conv then 3*3*512*1024 conv -> 13*13*1024
        for i in range(4):
            input_data = common.residual_block(input_data, 1024, 512, 1024, trainable=trainable, name='residual%d' %(i+19))
        # route_1 52*52*256, route_2 26*26*512, input_data 13*13*1024
        return route_1, route_2, input_data

YOLOV3结构

# Build the three raw detection heads. A failure is still reported as
# NotImplementedError, but only ordinary exceptions are intercepted (a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit) and the original
# error is chained so the real cause stays visible in the traceback.
try:
    self.conv_lbbox, self.conv_mbbox, self.conv_sbbox = self.__build_nework(input_data)
except Exception as err:
    raise NotImplementedError("Can not build up yolov3 network!") from err

# Decode each raw head with its own anchors and stride. Index 0 belongs to
# the "s" head, index 1 to the "m" head and index 2 to the "l" head.
with tf.variable_scope('pred_sbbox'):
    self.pred_sbbox = self.decode(self.conv_sbbox, self.anchors[0], self.strides[0])

with tf.variable_scope('pred_mbbox'):
    self.pred_mbbox = self.decode(self.conv_mbbox, self.anchors[1], self.strides[1])

with tf.variable_scope('pred_lbbox'):
    self.pred_lbbox = self.decode(self.conv_lbbox, self.anchors[2], self.strides[2])

def __build_nework(self, input_data):
    """Build the YOLOv3 detection heads on top of the Darknet-53 backbone.

    Shape comments assume a 416*416*3 input image. Each head has
    3*(num_class + 5) output channels (e.g. 255 when num_class is 80).

    :param input_data: input image tensor passed to the backbone.
    :return: (conv_lbbox, conv_mbbox, conv_sbbox) - the raw, undecoded
             outputs of the 13*13, 26*26 and 52*52 heads respectively.
    """
    # route_1 52*52*256, route_2 26*26*512, input_data 13*13*1024
    route_1, route_2, input_data = backbone.darknet53(input_data, self.trainable)
    # input_data 13*13*512
    input_data = common.convolutional(input_data, (1, 1, 1024,  512), self.trainable, 'conv52')
    # input_data 13*13*1024
    input_data = common.convolutional(input_data, (3, 3,  512, 1024), self.trainable, 'conv53')
    # input_data 13*13*512
    input_data = common.convolutional(input_data, (1, 1, 1024,  512), self.trainable, 'conv54')
    # input_data 13*13*1024
    input_data = common.convolutional(input_data, (3, 3,  512, 1024), self.trainable, 'conv55')
    # input_data 13*13*512
    input_data = common.convolutional(input_data, (1, 1, 1024,  512), self.trainable, 'conv56')
    # conv_lobj_branch 13*13*1024
    conv_lobj_branch = common.convolutional(input_data, (3, 3, 512, 1024), self.trainable, name='conv_lobj_branch')
    # conv_lbbox 13*13*(3*(num_class+5)); plain convolution, no batch norm, no activation
    conv_lbbox = common.convolutional(conv_lobj_branch, (1, 1, 1024, 3*(self.num_class + 5)),
                                      trainable=self.trainable, name='conv_lbbox', activate=False, bn=False)
    # input_data 13*13*256
    input_data = common.convolutional(input_data, (1, 1,  512,  256), self.trainable, 'conv57')
    # input_data 26*26*256
    input_data = common.upsample(input_data, name='upsample0', method=self.upsample_method)
    
    # Concatenate the upsampled map with route_2 on the channel axis: 26*26*(256+512)
    with tf.variable_scope('route_1'):
        input_data = tf.concat([input_data, route_2], axis=-1)
    # input_data 26*26*256
    input_data = common.convolutional(input_data, (1, 1, 768, 256), self.trainable, 'conv58')
    # input_data 26*26*512
    input_data = common.convolutional(input_data, (3, 3, 256, 512), self.trainable, 'conv59')
    # input_data 26*26*256
    input_data = common.convolutional(input_data, (1, 1, 512, 256), self.trainable, 'conv60')
    # input_data 26*26*512
    input_data = common.convolutional(input_data, (3, 3, 256, 512), self.trainable, 'conv61')
    # input_data 26*26*256
    input_data = common.convolutional(input_data, (1, 1, 512, 256), self.trainable, 'conv62')
    # conv_mobj_branch 26*26*512
    conv_mobj_branch = common.convolutional(input_data, (3, 3, 256, 512),  self.trainable, name='conv_mobj_branch' )
    # conv_mbbox 26*26*(3*(num_class+5)); plain convolution, no batch norm, no activation
    conv_mbbox = common.convolutional(conv_mobj_branch, (1, 1, 512, 3*(self.num_class + 5)),
                                      trainable=self.trainable, name='conv_mbbox', activate=False, bn=False)
    # input_data 26*26*128
    input_data = common.convolutional(input_data, (1, 1, 256, 128), self.trainable, 'conv63')
    # input_data 52*52*128
    input_data = common.upsample(input_data, name='upsample1', method=self.upsample_method)

    with tf.variable_scope('route_2'):
        # Concatenate the upsampled map with route_1: 52*52*(128+256)
        input_data = tf.concat([input_data, route_1], axis=-1)

    # Five alternating 1*1 / 3*3 convolutions; input_data ends as 52*52*128
    input_data = common.convolutional(input_data, (1, 1, 384, 128), self.trainable, 'conv64')
    input_data = common.convolutional(input_data, (3, 3, 128, 256), self.trainable, 'conv65')
    input_data = common.convolutional(input_data, (1, 1, 256, 128), self.trainable, 'conv66')
    input_data = common.convolutional(input_data, (3, 3, 128, 256), self.trainable, 'conv67')
    input_data = common.convolutional(input_data, (1, 1, 256, 128), self.trainable, 'conv68')

    # conv_sobj_branch 52*52*256
    conv_sobj_branch = common.convolutional(input_data, (3, 3, 128, 256), self.trainable, name='conv_sobj_branch')
    # conv_sbbox 52*52*(3*(num_class+5)); plain convolution, no batch norm, no activation
    conv_sbbox = common.convolutional(conv_sobj_branch, (1, 1, 256, 3*(self.num_class + 5)),
                                      trainable=self.trainable, name='conv_sbbox', activate=False, bn=False)
    # conv_lbbox, conv_mbbox, conv_sbbox: 13*13, 26*26 and 52*52 grids, each with 3*(num_class+5) channels
    return conv_lbbox, conv_mbbox, conv_sbbox

从网络整体来看，相当于将输入图片划分为13*13,26*26,52*52的区域，每一个区域经过网络预测3个框，对应255个输出[(tx,ty,tw,th,tc,prob*80),(tx,ty,tw,th,tc,prob*80),(tx,ty,tw,th,tc,prob*80)]，注意这个输出（即conv_lbbox, conv_mbbox, conv_sbbox）并不是边框实际位置，网络输出还需要经过直接位置预测，得到原始图片尺寸的边框信息(x,y,w,h,conf,class)

摘录代码如下：conv_output为网络输出（如13*13*255），stride为下采样倍率（即该尺度特征图相对网络输入图片的缩小倍数）

    def decode(self, conv_output, anchors, stride):
        """Turn one raw head output into box predictions at input-image scale.

        :param conv_output: raw head output; reshaped here to
                            (batch, size, size, anchors, 5 + num_class), so the
                            grid is taken to be square.
        :param anchors: anchors of this scale; ``len(anchors)`` is the number of
                        boxes predicted per grid cell.
        :param stride: factor that maps grid units to input-image pixels.
        :return: tensor whose last axis is (x, y, w, h, conf, class probs...).
        """
        conv_shape       = tf.shape(conv_output)
        batch_size       = conv_shape[0]
        output_size      = conv_shape[1]
        anchor_per_scale = len(anchors)

        conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, anchor_per_scale, 5 + self.num_class))
        # Raw centre offsets predicted by the network
        conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
        # Raw width/height terms predicted by the network
        conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
        # Raw confidence score
        conv_raw_conf = conv_output[:, :, :, :, 4:5]
        # Raw per-class scores
        conv_raw_prob = conv_output[:, :, :, :, 5: ]

        # y[r, c] = r (row index), x[r, c] = c (column index)
        y = tf.tile(tf.range(output_size, dtype=tf.int32)[:, tf.newaxis], [1, output_size])
        x = tf.tile(tf.range(output_size, dtype=tf.int32)[tf.newaxis, :], [output_size, 1])

        # Grid of (x, y) cell indices, repeated for every batch item and anchor
        xy_grid = tf.concat([x[:, :, tf.newaxis], y[:, :, tf.newaxis]], axis=-1)
        xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, anchor_per_scale, 1])
        xy_grid = tf.cast(xy_grid, tf.float32)
        # stride depends on the grid scale; it converts grid units into input-image pixels
        pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * stride  # cell index + sigmoid offset, scaled by stride
        pred_wh = (tf.exp(conv_raw_dwdh) * anchors) * stride  # exp scale factor times anchor, scaled by stride
        pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)
        # Confidence and class scores are squashed with a sigmoid
        pred_conf = tf.sigmoid(conv_raw_conf)
        pred_prob = tf.sigmoid(conv_raw_prob)

        return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)

直接位置预测之后，一张图片会计算出很多个预测框，对这些预测框进行后处理与NMS，得到最终的检测结果。后处理代码摘录如下：pred_bbox为最终预测框，org_img_shape为输入图片原始大小，input_size为网络输入图片大小，score_threshold为分数阈值

def postprocess_boxes(pred_bbox, org_img_shape, input_size, score_threshold):
    """Map decoded predictions back to the original image and filter them.

    :param pred_bbox: array-like of rows (x, y, w, h, conf, class probs...)
                      in network-input coordinates.
    :param org_img_shape: (height, width) of the original image.
    :param input_size: side length of the square network input.
    :param score_threshold: boxes with conf * best class prob at or below
                            this value are dropped.
    :return: array of rows (xmin, ymin, xmax, ymax, score, class).
    """
    valid_scale = [0, np.inf]
    predictions = np.array(pred_bbox)

    xywh = predictions[:, 0:4]
    objectness = predictions[:, 4]
    class_probs = predictions[:, 5:]

    # (1) (x, y, w, h) --> (xmin, ymin, xmax, ymax)
    half_wh = xywh[:, 2:] * 0.5
    corners = np.concatenate([xywh[:, :2] - half_wh, xywh[:, :2] + half_wh], axis=-1)

    # (2) Undo the letterbox resize: remove the padding offset, then rescale
    org_h, org_w = org_img_shape
    resize_ratio = min(input_size / org_w, input_size / org_h)
    dw = (input_size - resize_ratio * org_w) / 2
    dh = (input_size - resize_ratio * org_h) / 2
    corners[:, [0, 2]] = (corners[:, [0, 2]] - dw) / resize_ratio
    corners[:, [1, 3]] = (corners[:, [1, 3]] - dh) / resize_ratio

    # (3) Clip boxes to the image; boxes that become inverted are zeroed
    top_left = np.maximum(corners[:, :2], [0, 0])
    bottom_right = np.minimum(corners[:, 2:], [org_w - 1, org_h - 1])
    corners = np.concatenate([top_left, bottom_right], axis=-1)
    inverted = (corners[:, 0] > corners[:, 2]) | (corners[:, 1] > corners[:, 3])
    corners[inverted] = 0

    # (4) Keep boxes whose scale sqrt(w * h) lies strictly inside valid_scale
    extent = corners[:, 2:4] - corners[:, 0:2]
    box_scale = np.sqrt(extent[:, 0] * extent[:, 1])
    scale_ok = (valid_scale[0] < box_scale) & (box_scale < valid_scale[1])

    # (5) Drop low-scoring boxes
    classes = np.argmax(class_probs, axis=-1)
    scores = objectness * class_probs[np.arange(len(corners)), classes]
    keep = scale_ok & (scores > score_threshold)

    return np.concatenate([corners[keep],
                           scores[keep][:, np.newaxis],
                           classes[keep][:, np.newaxis]], axis=-1)

NMS代码摘录如下

# Non-maximum suppression, applied independently to the boxes of each class
def nms(bboxes, iou_threshold, sigma=0.3, method='nms'):
    """
    :param bboxes: (xmin, ymin, xmax, ymax, score, class)
    :param iou_threshold: with method 'nms', boxes whose IoU with the current
                          best box exceeds this value are discarded
    :param sigma: decay parameter of the soft-nms score weighting
    :param method: 'nms' or 'soft-nms'
    :return: list of the selected boxes (one row each)

    Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf
          https://github.com/bharatsingh430/soft-nms
    """
    selected = []

    for cls in list(set(bboxes[:, 5])):
        remaining = bboxes[bboxes[:, 5] == cls]

        while len(remaining) > 0:
            # Take the highest-scoring box and remove it from the pool.
            top = np.argmax(remaining[:, 4])
            winner = remaining[top]
            selected.append(winner)
            remaining = np.concatenate([remaining[: top], remaining[top + 1:]])

            iou = bboxes_iou(winner[np.newaxis, :4], remaining[:, :4])
            weight = np.ones((len(iou),), dtype=np.float32)
            assert method in ['nms', 'soft-nms']
            if method == 'nms':
                # Hard suppression: zero the score of heavily overlapping boxes.
                weight[iou > iou_threshold] = 0.0
            if method == 'soft-nms':
                # Soft suppression: decay scores with a Gaussian of the IoU.
                weight = np.exp(-(1.0 * iou ** 2 / sigma))

            remaining[:, 4] = remaining[:, 4] * weight
            # Only boxes that still have a positive score stay in the pool.
            remaining = remaining[remaining[:, 4] > 0.]
    return selected

二、数据准备

以voc数据集为例，voc数据集的图片信息以xml文件存储，第一步应解析xml文件获得数据集信息，每张图片的信息表示为['image path','xmin','ymin','xmax','ymax','class_id',...]，最终得到voc_train.txt与voc_test.txt，代码如下：

# -*- coding: utf-8 -*-

#xml文件读取可参考https://www.w3school.com.cn/xmldom/dom_nodes_info.asp 

import xml.dom.minidom
import os
import parameter


def _node_text(element, tag):
    """Return the text of the first ``tag`` element found under ``element``."""
    return element.getElementsByTagName(tag)[0].childNodes[0].data


def _format_annotation(xml_path, image_dir):
    """Convert one VOC XML file into a single annotation line.

    The line is ``<image path> `` followed by one
    ``xmin,ymin,xmax,ymax,class_id `` group per object and a newline. The
    class id is the index of the object name in ``parameter.voc_name``.
    """
    root = xml.dom.minidom.parse(xml_path).documentElement
    fields = [os.path.join(image_dir, _node_text(root, "filename"))]
    for obj in root.getElementsByTagName("object"):
        bndbox = obj.getElementsByTagName('bndbox')[0]
        values = [_node_text(bndbox, tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        values.append(str(parameter.voc_name.index(_node_text(obj, 'name'))))
        fields.append(','.join(values))
    # Every field, including the last one, is followed by a single space.
    return ' '.join(fields) + ' \n'


def _export_annotations(annotation_dir, image_dir, output_path, mode, done_message):
    """Write one annotation line per XML file of ``annotation_dir``.

    ``mode`` is the ``open`` mode: 'w' replaces ``output_path`` (no explicit
    truncate is needed), 'a' appends to it. Files are processed in sorted
    order so the output is reproducible, and entries that are not ``.xml``
    files are skipped instead of crashing the XML parser.
    """
    filelist = sorted(os.listdir(annotation_dir))
    with open(output_path, mode) as file_handle:
        for xmlfile in filelist:
            if not xmlfile.lower().endswith('.xml'):
                continue
            file_handle.write(_format_annotation(os.path.join(annotation_dir, xmlfile), image_dir))
    print(done_message)


testfile_path = "VOC2007_TEST/Annotations"
_export_annotations(testfile_path, "VOC2007_TEST/JPEGImages/", "voc_test.txt", 'w',
                    'VOC2007_TEST Read end!')

trainfile_path1 = "VOC2007_TRAIN/Annotations"
_export_annotations(trainfile_path1, "VOC2007_TRAIN/JPEGImages/", "voc_train.txt", 'w',
                    'VOC2007_TRAIN Read end!')

trainfile_path2 = "VOC2012_TRAIN/Annotations"
# NOTE: append mode - the VOC2012 entries are added after the VOC2007 ones.
_export_annotations(trainfile_path2, "VOC2012_TRAIN/JPEGImages/", "voc_train.txt", 'a',
                    'VOC2012_TRAIN Read end!')

接下来可以加载图片信息，进行相关处理，得到图片的对应标签。处理流程如下：

1、加载所有图片信息

# Load voc_train.txt or voc_test.txt
def load_annotations(data_path):
    """Read an annotation file and return its usable lines in random order.

    Each line is stripped of surrounding whitespace; lines that hold nothing
    after the image path (no ground-truth box) are dropped. The result is
    shuffled in place with ``np.random.shuffle``.

    :param data_path: path of the annotation text file.
    :return: list of annotation strings.
    """
    annotations = []
    with open(data_path, 'r') as f:
        for raw_line in f.readlines():
            stripped = raw_line.strip()
            # More than one whitespace-separated token means at least one box.
            if len(stripped.split()) > 1:
                annotations.append(stripped)
    np.random.shuffle(annotations)
    return annotations

2、解析图片信息得到图片数据与边框数据，并对图片和边框进行数据增强和尺寸统一（原始图片尺寸不一致）

# Data augmentation: random horizontal flip
def random_horizontal_flip(image, bboxes):
    """Mirror the image left-right with probability 0.5.

    :param image: HWC image array.
    :param bboxes: array whose columns 0 and 2 are xmin and xmax; it is
                   modified in place when the flip happens.
    :return: (image, bboxes).
    """
    if random.random() >= 0.5:
        return image, bboxes

    width = image.shape[1]
    # HWC layout: rows stay, the column axis is reversed.
    image = image[:, ::-1, :]
    # The mirrored xmin comes from the old xmax and vice versa.
    bboxes[:, [0, 2]] = width - bboxes[:, [2, 0]]
    return image, bboxes

# Data augmentation: random crop
def random_crop(image, bboxes):
    """Crop the image with probability 0.5 while keeping every box inside.

    The crop window always contains the smallest rectangle enclosing all
    boxes; each of its four sides is pushed outwards from that rectangle by a
    random amount, limited by the image border.

    :param image: HWC image array.
    :param bboxes: array whose first four columns are (xmin, ymin, xmax, ymax);
                   it is modified in place when the crop happens.
    :return: (image, bboxes) with the boxes expressed in the cropped image.
    """
    if random.random() < 0.5:
        h, w, _ = image.shape
        # Smallest rectangle that contains every ground-truth box.
        max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

        # Free space between that rectangle and each image border.
        max_l_trans = max_bbox[0]
        max_u_trans = max_bbox[1]
        max_r_trans = w - max_bbox[2]
        max_d_trans = h - max_bbox[3]

        crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans)))
        crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans)))
        # The right/bottom limits must be clamped with min(): max(w, ...) and
        # max(h, ...) always evaluate to w and h, so those sides were never cropped.
        crop_xmax = min(w, int(max_bbox[2] + random.uniform(0, max_r_trans)))
        crop_ymax = min(h, int(max_bbox[3] + random.uniform(0, max_d_trans)))

        image = image[crop_ymin : crop_ymax, crop_xmin : crop_xmax]

        # The origin moved to the crop's top-left corner, so shift the boxes.
        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin
        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin
    return image, bboxes

# Data augmentation: random translation
def random_translate(image, bboxes):
    """Shift the image by a random offset with probability 0.5.

    The offset range is derived from the free space around the rectangle that
    encloses all boxes.

    :param image: HWC image array.
    :param bboxes: array whose first four columns are (xmin, ymin, xmax, ymax);
                   it is modified in place when the shift happens.
    :return: (image, bboxes).
    """
    if random.random() >= 0.5:
        return image, bboxes

    h, w, _ = image.shape
    # Rectangle enclosing every box: (min xmin, min ymin, max xmax, max ymax).
    top_left = np.min(bboxes[:, 0:2], axis=0)
    bottom_right = np.max(bboxes[:, 2:4], axis=0)
    max_bbox = np.concatenate([top_left, bottom_right], axis=-1)

    room_left, room_up = max_bbox[0], max_bbox[1]
    room_right, room_down = w - max_bbox[2], h - max_bbox[3]

    tx = random.uniform(-(room_left - 1), (room_right - 1))
    ty = random.uniform(-(room_up - 1), (room_down - 1))

    # Affine matrix of a pure translation by (tx, ty).
    M = np.array([[1, 0, tx], [0, 1, ty]])
    image = cv2.warpAffine(image, M, (w, h))

    bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + tx
    bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + ty

    return image, bboxes

# Letterbox resize: scale image and boxes by the smaller ratio, then pad with
# the value 128 up to the target size. Images are in HWC layout.
def image_preporcess(image, target_size, gt_boxes=None):
    """Resize an image to ``target_size`` keeping its aspect ratio.

    The image is converted from BGR to RGB, resized, centred on a canvas
    filled with 128 and finally divided by 255.

    :param image: HWC BGR image.
    :param target_size: (height, width) of the output.
    :param gt_boxes: optional array whose first four columns are
                     (xmin, ymin, xmax, ymax); modified in place.
    :return: the padded image, or (padded image, gt_boxes) when boxes are given.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)

    ih, iw = target_size
    h, w, _ = rgb.shape

    scale = min(iw/w, ih/h)
    nw = int(scale * w)
    nh = int(scale * h)
    # Offsets that centre the resized image on the canvas.
    dw = (iw - nw) // 2
    dh = (ih - nh) // 2

    canvas = np.full(shape=[ih, iw, 3], fill_value=128.0)
    canvas[dh:dh + nh, dw:dw + nw, :] = cv2.resize(rgb, (nw, nh))
    canvas = canvas / 255.

    if gt_boxes is not None:
        # Apply the same scale and offset to the box coordinates.
        gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + dw
        gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + dh
        return canvas, gt_boxes
    return canvas
    
# Turn one annotation line into the resized image and its boxes; each box row
# is (xmin, ymin, xmax, ymax, class).
def parse_annotation(annotation,data_aug,train_input_size):
    """Load the image of an annotation line and prepare it for training.

    :param annotation: "<image path> xmin,ymin,xmax,ymax,class ..." string.
    :param data_aug: when truthy, apply random flip, crop and translation.
    :param train_input_size: side length of the square network input.
    :return: (image, bboxes) after letterbox resizing.
    :raises KeyError: if the image path does not exist.
    """
    fields = annotation.split()
    image_path = fields[0]
    if not os.path.exists(image_path):
        raise KeyError("%s does not exist ... " %image_path)
    image = np.array(cv2.imread(image_path))

    # One integer row per box; values go through float() first.
    bboxes = np.array([[int(float(value)) for value in box.split(',')] for box in fields[1:]])

    # Optional augmentation; every step works on fresh copies.
    if(data_aug):
        for augment in (random_horizontal_flip, random_crop, random_translate):
            image, bboxes = augment(np.copy(image), np.copy(bboxes))

    target_size = [train_input_size, train_input_size]
    image, bboxes = image_preporcess(np.copy(image), target_size, np.copy(bboxes))
    return image, bboxes

3、对图片的边框数据进行分析，得到三种网格划分情况下的标签，以及真实边框位置信息

# IoU between boxes given as (x_center, y_center, width, height)
def bbox_iou(boxes1, boxes2):
    """Compute the intersection-over-union of two sets of boxes.

    :param boxes1: array-like whose last axis is (x, y, w, h).
    :param boxes2: array-like whose last axis is (x, y, w, h); must broadcast
                   against ``boxes1``.
    :return: array of IoU values.
    """
    def _corners(xywh):
        # (x, y, w, h) -> (xmin, ymin, xmax, ymax)
        half = xywh[..., 2:] * 0.5
        return np.concatenate([xywh[..., :2] - half, xywh[..., :2] + half], axis=-1)

    first = np.array(boxes1)
    second = np.array(boxes2)

    area_first = first[..., 2] * first[..., 3]
    area_second = second[..., 2] * second[..., 3]

    first = _corners(first)
    second = _corners(second)

    # Corners of the overlap rectangle; negative extents are clamped to 0.
    overlap_min = np.maximum(first[..., :2], second[..., :2])
    overlap_max = np.minimum(first[..., 2:], second[..., 2:])
    overlap = np.maximum(overlap_max - overlap_min, 0.0)

    inter_area = overlap[..., 0] * overlap[..., 1]
    union_area = area_first + area_second - inter_area
    return inter_area / union_area
    
def preprocess_true_boxes(bboxes,train_output_sizes):
    """Build the training labels of one image for the three output scales.

    :param bboxes: iterable of integer rows (xmin, ymin, xmax, ymax, class).
    :param train_output_sizes: grid size of each of the three scales.
    :return: (label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes).
             Each label array has shape (size, size, anchor_per_scale,
             5 + num_classes) with (x, y, w, h, conf, smoothed one-hot class)
             on the last axis; each ``*bboxes`` array has shape
             (max_bbox_per_scale, 4) and holds the (x, y, w, h) of the boxes
             assigned to that scale.
    """
    # One label grid per scale, all zeros until a box is assigned to a cell.
    label = [np.zeros((train_output_sizes[i], train_output_sizes[i], parameter.anchor_per_scale,
                       5 + parameter.num_classes)) for i in range(3)]
    # Per-scale list of assigned ground-truth boxes. It works as a ring buffer:
    # after max_bbox_per_scale entries the oldest ones are overwritten.
    bboxes_xywh = [np.zeros((parameter.max_bbox_per_scale, 4)) for _ in range(3)]
    bbox_count = np.zeros((3,))

    def _assign(scale, anchor_sel, bbox_xywh, bbox_xywh_scaled, smooth_onehot):
        """Write one box into the label grid and box list of ``scale``.

        ``anchor_sel`` is either a boolean mask over the anchors or a single
        anchor index.
        """
        # Grid cell that contains the box centre.
        xind, yind = np.floor(bbox_xywh_scaled[scale, 0:2]).astype(np.int32)

        label[scale][yind, xind, anchor_sel, :] = 0
        label[scale][yind, xind, anchor_sel, 0:4] = bbox_xywh
        label[scale][yind, xind, anchor_sel, 4:5] = 1.0
        label[scale][yind, xind, anchor_sel, 5:] = smooth_onehot

        bbox_ind = int(bbox_count[scale] % parameter.max_bbox_per_scale)
        bboxes_xywh[scale][bbox_ind, :4] = bbox_xywh
        bbox_count[scale] += 1

    for bbox in bboxes:
        bbox_coor = bbox[:4]
        bbox_class_ind = bbox[4]

        # Label-smoothed one-hot class vector. np.float64 replaces the
        # np.float alias, which no longer exists in current NumPy releases.
        onehot = np.zeros(parameter.num_classes, dtype=np.float64)
        onehot[bbox_class_ind] = 1.0
        uniform_distribution = np.full(parameter.num_classes, 1.0 / parameter.num_classes)
        deta = 0.01
        smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

        # (xmin, ymin, xmax, ymax) -> (x_center, y_center, width, height)
        bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1)
        # The same box expressed in grid units of every scale: one row per stride.
        bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / parameter.strides[:, np.newaxis]

        iou = []
        exist_positive = False
        for i in range(3):
            # Anchors are centred on the cell that holds the box centre and
            # take their width/height from the anchors of this scale.
            anchors_xywh = np.zeros((parameter.anchor_per_scale, 4))
            anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
            anchors_xywh[:, 2:4] = parameter.anchors[i]

            # IoU of the box with every anchor of this scale.
            iou_scale = bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
            iou.append(iou_scale)
            iou_mask = iou_scale > 0.3

            if np.any(iou_mask):
                # Every anchor with IoU > 0.3 becomes responsible for the box.
                _assign(i, iou_mask, bbox_xywh, bbox_xywh_scaled, smooth_onehot)
                exist_positive = True

        # No anchor reached IoU > 0.3 on any scale: fall back to the single
        # best anchor over all scales.
        if not exist_positive:
            best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
            # Scale the best anchor belongs to
            best_detect = int(best_anchor_ind / parameter.anchor_per_scale)
            # Index of the anchor inside that scale
            best_anchor = int(best_anchor_ind % parameter.anchor_per_scale)
            _assign(best_detect, best_anchor, bbox_xywh, bbox_xywh_scaled, smooth_onehot)

    label_sbbox, label_mbbox, label_lbbox = label
    sbboxes, mbboxes, lbboxes = bboxes_xywh
    # Labels (x, y, w, h, conf, class) and box lists (x, y, w, h) of the three scales
    return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

4、将输入数据与输出标签做成批次，以便进行训练

def __next__(annotations,batch_size):
    """Assemble one training batch of images, per-scale labels and ground-truth boxes.

    A training resolution is drawn at random from ``parameter.train_input_size``;
    the three output grid sizes are that resolution divided by
    ``parameter.strides``.

    :param annotations: indexable, shuffle-able collection of annotation records
    :param batch_size: number of samples to place in the batch
    :return: ``(batch_image, batch_label_sbbox, batch_label_mbbox,
             batch_label_lbbox, batch_sbboxes, batch_mbboxes, batch_lbboxes)``
    :raises StopIteration: when there is no batch to produce (after shuffling
             ``annotations`` in place)

    NOTE(review): ``batch_count`` is a local that always starts at 0, so every
    call reads samples ``0 .. batch_size-1`` and the StopIteration path is only
    taken when ``annotations`` is empty. This looks like a method of an
    iterator class flattened into a function - confirm the intended state
    handling with the caller.
    """
    with tf.device('/cpu:0'):
        input_size = random.choice(parameter.train_input_size)
        output_sizes = input_size // parameter.strides

        # Network input, NHWC layout.
        batch_image = np.zeros((batch_size, input_size, input_size, 3))

        # One label tensor per output scale: (batch, grid, grid, anchors, 5 + classes).
        per_cell = (parameter.anchor_per_scale, 5 + parameter.num_classes)
        batch_label_sbbox = np.zeros((batch_size, output_sizes[0], output_sizes[0]) + per_cell)
        batch_label_mbbox = np.zeros((batch_size, output_sizes[1], output_sizes[1]) + per_cell)
        batch_label_lbbox = np.zeros((batch_size, output_sizes[2], output_sizes[2]) + per_cell)

        # Raw ground-truth boxes per scale, capped at max_bbox_per_scale entries.
        gt_shape = (batch_size, parameter.max_bbox_per_scale, 4)
        batch_sbboxes = np.zeros(gt_shape)
        batch_mbboxes = np.zeros(gt_shape)
        batch_lbboxes = np.zeros(gt_shape)

        batch_count = 0
        num_samples = len(annotations)
        num_batchs = int(np.ceil(num_samples / batch_size))

        # Nothing left to serve: reshuffle for the next pass and stop.
        if batch_count >= num_batchs:
            np.random.shuffle(annotations)
            raise StopIteration

        for slot in range(batch_size):
            sample_index = batch_count * batch_size + slot
            # Wrap around so the last batch is padded with leading samples.
            if sample_index >= num_samples:
                sample_index -= num_samples

            # Decode the annotation into an image and its boxes, then build labels.
            image, bboxes = parse_annotation(annotations[sample_index], True, input_size)
            encoded = preprocess_true_boxes(bboxes, output_sizes)
            label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = encoded

            batch_image[slot, :, :, :] = image
            batch_label_sbbox[slot, :, :, :, :] = label_sbbox
            batch_label_mbbox[slot, :, :, :, :] = label_mbbox
            batch_label_lbbox[slot, :, :, :, :] = label_lbbox
            batch_sbboxes[slot, :, :] = sbboxes
            batch_mbboxes[slot, :, :] = mbboxes
            batch_lbboxes[slot, :, :] = lbboxes

        return batch_image, batch_label_sbbox, batch_label_mbbox, batch_label_lbbox, \
               batch_sbboxes, batch_mbboxes, batch_lbboxes

三、网络训练与损失函数

损失函数由三部分组成：边界框的 GIoU 损失、置信度的 Focal Loss 损失，以及类别的交叉熵损失，详情可见参考链接[3]。

    #类别不平衡问题 https://blog.csdn.net/qq_34914551/article/details/89049001
    def focal(self, target, actual, alpha=1, gamma=2):
        """Return the focal modulating factor ``alpha * |target - actual| ** gamma``.

        :param target: ground-truth tensor
        :param actual: predicted tensor, broadcast-compatible with ``target``
        :param alpha: scalar weight applied to the factor
        :param gamma: exponent; larger values suppress well-predicted entries more
        """
        deviation = tf.abs(target - actual)
        return alpha * tf.pow(deviation, gamma)
    
    #GIOU说明 https://zhuanlan.zhihu.com/p/80600110
    def bbox_giou(self, boxes1, boxes2):
        """Compute the Generalized IoU between two sets of boxes.

        Both inputs hold boxes as ``(center_x, center_y, width, height)`` in
        their last axis and must be broadcast-compatible.

        :return: GIoU tensor with the last (coordinate) axis removed
        """
        def to_corners(xywh):
            # (cx, cy, w, h) -> (x1, y1, x2, y2)
            centre = xywh[..., :2]
            half_extent = xywh[..., 2:] * 0.5
            return tf.concat([centre - half_extent, centre + half_extent], axis=-1)

        def order_corners(corners):
            # Guarantee (min, max) ordering even if width/height were negative.
            first, second = corners[..., :2], corners[..., 2:]
            return tf.concat([tf.minimum(first, second), tf.maximum(first, second)], axis=-1)

        def area_of(corners):
            return (corners[..., 2] - corners[..., 0]) * (corners[..., 3] - corners[..., 1])

        boxes1 = to_corners(boxes1)
        boxes2 = to_corners(boxes2)
        boxes1 = order_corners(boxes1)
        boxes2 = order_corners(boxes2)

        area1 = area_of(boxes1)
        area2 = area_of(boxes2)

        # Intersection rectangle, clamped to zero size when the boxes are disjoint.
        overlap_min = tf.maximum(boxes1[..., :2], boxes2[..., :2])
        overlap_max = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
        overlap_wh = tf.maximum(overlap_max - overlap_min, 0.0)
        inter_area = overlap_wh[..., 0] * overlap_wh[..., 1]

        union_area = area1 + area2 - inter_area
        iou = inter_area / union_area

        # Smallest axis-aligned rectangle enclosing both boxes.
        hull_min = tf.minimum(boxes1[..., :2], boxes2[..., :2])
        hull_max = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
        hull_wh = tf.maximum(hull_max - hull_min, 0.0)
        enclose_area = hull_wh[..., 0] * hull_wh[..., 1]

        # GIoU = IoU - (part of the enclosing box not covered by the union).
        return iou - (enclose_area - union_area) / enclose_area

#conv 网络输出 pred 直接位置预测 label 标签 bboxes
    def loss_layer(self, conv, pred, label, bboxes, anchors, stride):
        """Compute the GIoU, confidence and class-probability losses for one scale.

        :param conv: raw head output; reshaped here to
                     (batch, grid, grid, anchor_per_scale, 5 + num_class)
        :param pred: decoded predictions, indexed as a rank-5 tensor whose last
                     axis is (x, y, w, h, conf, ...)
        :param label: label tensor with the same last-axis layout
                      (x, y, w, h, objectness, class probabilities)
        :param bboxes: ground-truth boxes; indexed so that the result is
                       (batch, 1, 1, 1, n_boxes, coords)
        :param anchors: not used in this method
        :param stride: stride of this scale; ``stride * grid`` gives the input size
        :return: ``(giou_loss, conf_loss, prob_loss)``, each summed over
                 axes 1-4 and averaged over the batch
        """
        conv_shape  = tf.shape(conv)
        batch_size  = conv_shape[0]
        output_size = conv_shape[1]
        input_size  = stride * output_size
        conv = tf.reshape(conv, (batch_size, output_size, output_size,
                                 self.anchor_per_scale, 5 + self.num_class))
        # Raw (pre-sigmoid) logits, needed by sigmoid_cross_entropy_with_logits.
        conv_raw_conf = conv[:, :, :, :, 4:5]
        conv_raw_prob = conv[:, :, :, :, 5:]

        pred_xywh     = pred[:, :, :, :, 0:4]
        pred_conf     = pred[:, :, :, :, 4:5]

        label_xywh    = label[:, :, :, :, 0:4]
        # 1 where the label marks an object for this cell/anchor, 0 elsewhere.
        respond_bbox  = label[:, :, :, :, 4:5]
        label_prob    = label[:, :, :, :, 5:]

        # GIoU loss between predicted and labelled boxes.
        giou = tf.expand_dims(self.bbox_giou(pred_xywh, label_xywh), axis=-1)
        input_size = tf.cast(input_size, tf.float32)
        # Weight = 2 - (label box area / input image area): smaller boxes weigh more.
        bbox_loss_scale = 2.0 - 1.0 * label_xywh[:, :, :, :, 2:3] * label_xywh[:, :, :, :, 3:4] / (input_size ** 2)
        giou_loss = respond_bbox * bbox_loss_scale * (1- giou)

        # Every prediction is broadcast against every ground-truth box.
        # NOTE(review): self.bbox_iou is not visible here - presumably it returns
        # IoU with the coordinate axis removed, leaving n_boxes as the last axis.
        iou = self.bbox_iou(pred_xywh[:, :, :, :, np.newaxis, :], bboxes[:, np.newaxis, np.newaxis, np.newaxis, :, :])
        # Best IoU over the last axis of `iou` for each prediction.
        max_iou = tf.expand_dims(tf.reduce_max(iou, axis=-1), axis=-1)
        # Background mask: no object in the label AND best IoU below the threshold.
        respond_bgd = (1.0 - respond_bbox) * tf.cast( max_iou < self.iou_loss_thresh, tf.float32 )
        # Confidence loss: focal-weighted cross-entropy over object and background
        # entries; entries that are neither contribute nothing.
        conf_focal = self.focal(respond_bbox, pred_conf)
        conf_loss = conf_focal * (
                respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
                +
                respond_bgd * tf.nn.sigmoid_cross_entropy_with_logits(labels=respond_bbox, logits=conv_raw_conf)
        )
        # Class loss, counted only where the label marks an object.
        prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_prob, logits=conv_raw_prob)

        # Sum over grid/anchor/channel axes, then average over the batch.
        giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis=[1,2,3,4]))
        conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis=[1,2,3,4]))
        prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis=[1,2,3,4]))

        return giou_loss, conf_loss, prob_loss

四、一个小应用，利用https://github.com/YunYang1994/tensorflow-yolov3的权值文件，重新构建yolov3网络前向过程，将江南style视频变成江南style-yolov3模式，完整代码如下，视频结果可见https://www.bilibili.com/video/BV1754y1S7yF/

# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np
import cv2
import colorsys
import random
from PIL import Image
from ffmpy import FFmpeg


# Module-level TF1 session configured for GPU memory use:
# allow_growth=True is set on the GPU options and the per-process memory
# fraction is set to 0.4.
# NOTE(review): the __main__ block opens its own tf.Session() without this
# config, so this session looks unused there - confirm before relying on
# these GPU limits.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
config.gpu_options.per_process_gpu_memory_fraction = 0.4
sess = tf.Session(config=config)

#读取目标类别
def read_class_names(class_file_name):
    """Load class names from a text file, one name per line.

    :param class_file_name: path of the names file
    :return: dict mapping the zero-based line index to that line's text with
             newline characters stripped from both ends
    """
    with open(class_file_name, 'r') as handle:
        return {index: line.strip('\n') for index, line in enumerate(handle)}
 
#输入图像处理 统一尺寸   
def image_preporcess(image, target_size, gt_boxes=None):
    """Letterbox an image to ``target_size`` and scale pixel values by 1/255.

    The image is channel-swapped with ``cv2.COLOR_BGR2RGB``, resized with its
    aspect ratio preserved, and centred on a canvas filled with 128.

    :param image: H x W x 3 image array
    :param target_size: ``(height, width)`` of the output canvas
    :param gt_boxes: optional array of boxes with x in columns 0 and 2 and y in
                     columns 1 and 3; it is rescaled/shifted IN PLACE
    :return: the padded image, or ``(padded image, gt_boxes)`` when boxes are given
    """
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)

    target_h, target_w = target_size
    src_h, src_w, _ = image.shape

    # One uniform scale so the whole image fits inside the target canvas.
    scale = min(target_w / src_w, target_h / src_h)
    new_w = int(scale * src_w)
    new_h = int(scale * src_h)
    resized = cv2.resize(image, (new_w, new_h))

    # Grey canvas with the resized image centred on it.
    canvas = np.full(shape=[target_h, target_w, 3], fill_value=128.0)
    pad_x = (target_w - new_w) // 2
    pad_y = (target_h - new_h) // 2
    canvas[pad_y:new_h + pad_y, pad_x:new_w + pad_x, :] = resized
    canvas = canvas / 255.

    if gt_boxes is None:
        return canvas

    # Apply the same scale and offset to the box coordinates.
    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] * scale + pad_x
    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] * scale + pad_y
    return canvas, gt_boxes


#YOLOV3结构 网络输入为图片数据 网络输出为预测框  pred_sbbox pred_mbbox pred_lbbox
class YOLOV3(object):  # explicit object base (new-style class)
    """YOLOv3 forward graph: backbone under the 'darknet' scope plus three heads.

    Construction builds the whole graph from ``input_data`` and exposes the
    raw head outputs (``conv_sbbox``, ``conv_mbbox``, ``conv_lbbox``) and the
    decoded predictions (``pred_sbbox``, ``pred_mbbox``, ``pred_lbbox``) for
    strides 8, 16 and 32 respectively.
    """
    def __init__(self,input_data,trainable):
        """Build the network and the three decoded prediction tensors.

        :param input_data: image tensor fed to the first convolution
        :param trainable: bool (or bool tensor) forwarded to batch
                          normalization's ``training`` argument
        """
        self.trainable        = trainable
        # Class names are read from 'coco.names' relative to the working directory.
        self.classes          = read_class_names('coco.names')
        self.num_class        = len(self.classes)
        self.strides          = np.array([8, 16, 32])
        # Nine (w, h) anchors arranged as (scale, anchor, 2). decode() multiplies
        # them by the stride, so they are expressed in grid-cell units.
        self.anchors          = np.array([1.25,1.625, 2.0,3.75, 4.125,2.875,
                                          1.875,3.8125, 3.875,2.8125, 3.6875,7.4375, 
                                          3.625,2.8125, 4.875,6.1875, 11.65625,10.1875]).reshape(3,3,2)
        self.anchor_per_scale = 3
        # Not referenced by any method visible in this class.
        self.iou_loss_thresh  = 0.5
        self.upsample_method  = 'resize'

        self.conv_lbbox, self.conv_mbbox, self.conv_sbbox = self.__build_nework(input_data)

        with tf.variable_scope('pred_sbbox'):
            self.pred_sbbox = self.decode(self.conv_sbbox, self.anchors[0], self.strides[0])

        with tf.variable_scope('pred_mbbox'):
            self.pred_mbbox = self.decode(self.conv_mbbox, self.anchors[1], self.strides[1])

        with tf.variable_scope('pred_lbbox'):
            self.pred_lbbox = self.decode(self.conv_lbbox, self.anchors[2], self.strides[2])
    
    # DBL block: convolution + batch normalization + leaky ReLU.
    def convolutional(self, input_data, filters_shape, trainable, name, downsample=False, activate=True, bn=True):
        """Apply a 2-D convolution with optional batch norm and leaky ReLU.

        :param input_data: NHWC input tensor
        :param filters_shape: (kernel_h, kernel_w, in_channels, out_channels)
        :param trainable: forwarded to batch normalization's ``training`` flag
                          only; the weight/bias variables are always created
                          with ``trainable=True``
        :param name: variable scope for this layer
        :param downsample: if True, pad explicitly and convolve with stride 2
        :param activate: if True, apply leaky ReLU (alpha=0.1)
        :param bn: if True use batch normalization, otherwise add a bias
        :return: the layer output tensor
        """

        with tf.variable_scope(name):
            if downsample:
                # Explicit symmetric zero padding followed by a stride-2 VALID conv.
                pad_h, pad_w = (filters_shape[0] - 2) // 2 + 1, (filters_shape[1] - 2) // 2 + 1
                paddings = tf.constant([[0, 0], [pad_h, pad_h], [pad_w, pad_w], [0, 0]])
                input_data = tf.pad(input_data, paddings, 'CONSTANT')
                strides = (1, 2, 2, 1)
                padding = 'VALID'
            else:
                strides = (1, 1, 1, 1)
                padding = "SAME"
    
            # Kernel weights, initialised from N(0, 0.01).
            weight = tf.get_variable(name='weight', dtype=tf.float32, trainable=True,
                                     shape=filters_shape, initializer=tf.random_normal_initializer(stddev=0.01))
            conv = tf.nn.conv2d(input=input_data, filter=weight, strides=strides, padding=padding)
    
            if bn:
                conv = tf.layers.batch_normalization(conv, beta_initializer=tf.zeros_initializer(),
                                                     gamma_initializer=tf.ones_initializer(),
                                                     moving_mean_initializer=tf.zeros_initializer(),
                                                     moving_variance_initializer=tf.ones_initializer(), training=trainable)
            else:
                # Without batch norm the layer gets an explicit zero-initialised bias.
                bias = tf.get_variable(name='bias', shape=filters_shape[-1], trainable=True,
                                       dtype=tf.float32, initializer=tf.constant_initializer(0.0))
                conv = tf.nn.bias_add(conv, bias)
    
            if activate == True: conv = tf.nn.leaky_relu(conv, alpha=0.1)

        return conv
    
    # Residual block.
    def residual_block(self, input_data, input_channel, filter_num1, filter_num2, trainable, name):
        """Apply a 1x1 conv then a 3x3 conv and add the block input back.

        The element-wise addition needs the second convolution's output to be
        shape-compatible with ``input_data`` (callers in this class pass
        ``filter_num2 == input_channel``).
        """

        short_cut = input_data
    
        with tf.variable_scope(name):
            input_data = self.convolutional(input_data, filters_shape=(1, 1, input_channel, filter_num1),
                                       trainable=trainable, name='conv1')
            input_data = self.convolutional(input_data, filters_shape=(3, 3, filter_num1,   filter_num2),
                                       trainable=trainable, name='conv2')
    
            residual_output = input_data + short_cut
    
        return residual_output
    
    # Concatenation block.
    def route(self, name, previous_output, current_output):
        """Concatenate ``current_output`` then ``previous_output`` on the last axis.

        Not called by ``__build_nework``, which concatenates inline instead.
        """
        with tf.variable_scope(name):
            output = tf.concat([current_output, previous_output], axis=-1)
    
        return output
    
    # Upsampling block.
    def upsample(self, input_data, name, method="deconv"):
        """Double the spatial size of ``input_data``.

        :param method: "resize" uses nearest-neighbour resizing inside the
                       ``name`` scope; "deconv" uses a stride-2 transposed
                       convolution (``name`` is not used on that path)
        """
        assert method in ["resize", "deconv"]
    
        if method == "resize":
            with tf.variable_scope(name):
                input_shape = tf.shape(input_data)
                output = tf.image.resize_nearest_neighbor(input_data, (input_shape[1] * 2, input_shape[2] * 2))
    
        if method == "deconv":
            # replace resize_nearest_neighbor with conv2d_transpose To support TensorRT optimization
            numm_filter = input_data.shape.as_list()[-1]
            output = tf.layers.conv2d_transpose(input_data, numm_filter, kernel_size=2, padding='same',
                                                strides=(2,2), kernel_initializer=tf.random_normal_initializer())
    
        return output

    def __build_nework(self, input_data):
        """Build the backbone and the three detection heads.

        :return: ``(conv_lbbox, conv_mbbox, conv_sbbox)`` - raw head outputs at
                 strides 32, 16 and 8, each with ``3 * (num_class + 5)`` channels

        NOTE(review): the scope names below determine the variable names;
        presumably they must match the names stored in the checkpoint being
        restored, so they should not be renamed or reordered.
        """
        
        with tf.variable_scope('darknet'):
            input_data = self.convolutional(input_data, filters_shape=(3, 3,  3,  32), 
                                            trainable=self.trainable, name='conv0')
            # Downsample #1 (stride 2 overall).
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 32,  64),
                                              trainable=self.trainable, name='conv1', downsample=True)
    
            for i in range(1):
                input_data = self.residual_block(input_data,  64,  32, 64, 
                                                 trainable=self.trainable, name='residual%d' %(i+0))
    
            # Downsample #2 (stride 4 overall).
            input_data = self.convolutional(input_data, filters_shape=(3, 3,  64, 128),
                                              trainable=self.trainable, name='conv4', downsample=True)
    
            for i in range(2):
                input_data = self.residual_block(input_data, 128,  64, 128, 
                                                 trainable=self.trainable, name='residual%d' %(i+1))
    
            # Downsample #3 (stride 8 overall).
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 128, 256),
                                              trainable=self.trainable, name='conv9', downsample=True)
    
            for i in range(8):
                input_data = self.residual_block(input_data, 256, 128, 256, 
                                                 trainable=self.trainable, name='residual%d' %(i+3))
    
            # Stride-8 feature map (256 channels), reused by the small-object head.
            route_1 = input_data
            # Downsample #4 (stride 16 overall).
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 256, 512),
                                              trainable=self.trainable, name='conv26', downsample=True)
    
            for i in range(8):
                input_data = self.residual_block(input_data, 512, 256, 512, 
                                                 trainable=self.trainable, name='residual%d' %(i+11))
    
            # Stride-16 feature map (512 channels), reused by the medium-object head.
            route_2 = input_data
            # Downsample #5 (stride 32 overall).
            input_data = self.convolutional(input_data, filters_shape=(3, 3, 512, 1024),
                                              trainable=self.trainable, name='conv43', downsample=True)
    
            for i in range(4):
                input_data = self.residual_block(input_data, 1024, 512, 1024, 
                                                 trainable=self.trainable, name='residual%d' %(i+19))
            
        # Large-object head (stride 32): alternating 1x1 / 3x3 convolutions.
        input_data = self.convolutional(input_data, (1, 1, 1024,  512), trainable=self.trainable, name = 'conv52')
        input_data = self.convolutional(input_data, (3, 3,  512, 1024), trainable=self.trainable, name = 'conv53')
        input_data = self.convolutional(input_data, (1, 1, 1024,  512), trainable=self.trainable, name = 'conv54')
        input_data = self.convolutional(input_data, (3, 3,  512, 1024), trainable=self.trainable, name = 'conv55')
        input_data = self.convolutional(input_data, (1, 1, 1024,  512), trainable=self.trainable, name = 'conv56')

        conv_lobj_branch = self.convolutional(input_data, (3, 3, 512, 1024), 
                                              trainable=self.trainable, name='conv_lobj_branch')
        # Final prediction layer: linear (no batch norm, no activation).
        conv_lbbox = self.convolutional(conv_lobj_branch, (1, 1, 1024, 3*(self.num_class + 5)),
                                          trainable=self.trainable, name='conv_lbbox', activate=False, bn=False)

        input_data = self.convolutional(input_data, (1, 1,  512,  256), 
                                        trainable=self.trainable, name = 'conv57')
        input_data = self.upsample(input_data, name='upsample0', method=self.upsample_method)

        # Merge the upsampled 256 channels with route_2's 512 channels -> 768.
        with tf.variable_scope('route_1'):
            input_data = tf.concat([input_data, route_2], axis=-1)

        # Medium-object head (stride 16).
        input_data = self.convolutional(input_data, (1, 1, 768, 256), trainable=self.trainable, name = 'conv58')
        input_data = self.convolutional(input_data, (3, 3, 256, 512), trainable=self.trainable, name = 'conv59')
        input_data = self.convolutional(input_data, (1, 1, 512, 256), trainable=self.trainable, name = 'conv60')
        input_data = self.convolutional(input_data, (3, 3, 256, 512), trainable=self.trainable, name = 'conv61')
        input_data = self.convolutional(input_data, (1, 1, 512, 256), trainable=self.trainable, name = 'conv62')

        conv_mobj_branch = self.convolutional(input_data, (3, 3, 256, 512),  
                                              trainable=self.trainable, name='conv_mobj_branch' )
        conv_mbbox = self.convolutional(conv_mobj_branch, (1, 1, 512, 3*(self.num_class + 5)),
                                          trainable=self.trainable, name='conv_mbbox', activate=False, bn=False)

        input_data = self.convolutional(input_data, (1, 1, 256, 128), trainable=self.trainable, name = 'conv63')
        input_data = self.upsample(input_data, name='upsample1', method=self.upsample_method)

        # Merge the upsampled 128 channels with route_1's 256 channels -> 384.
        with tf.variable_scope('route_2'):
            input_data = tf.concat([input_data, route_1], axis=-1)

        # Small-object head (stride 8).
        input_data = self.convolutional(input_data, (1, 1, 384, 128), trainable=self.trainable, name = 'conv64')
        input_data = self.convolutional(input_data, (3, 3, 128, 256), trainable=self.trainable, name = 'conv65')
        input_data = self.convolutional(input_data, (1, 1, 256, 128), trainable=self.trainable, name = 'conv66')
        input_data = self.convolutional(input_data, (3, 3, 128, 256), trainable=self.trainable, name = 'conv67')
        input_data = self.convolutional(input_data, (1, 1, 256, 128), trainable=self.trainable, name = 'conv68')

        conv_sobj_branch = self.convolutional(input_data, (3, 3, 128, 256), trainable=self.trainable, name='conv_sobj_branch')
        conv_sbbox = self.convolutional(conv_sobj_branch, (1, 1, 256, 3*(self.num_class + 5)),
                                          trainable=self.trainable, name='conv_sbbox', activate=False, bn=False)
        
        return conv_lbbox,conv_mbbox,conv_sbbox
    
    def decode(self, conv_output, anchors, stride):
        """Turn a raw head output into boxes, confidence and class probabilities.

        :param conv_output: raw head tensor with
                            ``anchor_per_scale * (5 + num_class)`` channels
        :param anchors: the (w, h) anchors of this scale
        :param stride: stride of this scale
        :return: tensor of shape (batch, grid_row, grid_col, anchor,
                 5 + num_class) holding (x, y, w, h, conf, class probs) with
                 x, y, w, h scaled by ``stride``
        """
        conv_shape       = tf.shape(conv_output)
        batch_size       = conv_shape[0]
        output_size      = conv_shape[1]
        anchor_per_scale = len(anchors)
        # Split the channel axis into (anchor, 5 + num_class); for 80 classes
        # and 3 anchors that is 3 * (80 + 5) = 255 channels.
        conv_output = tf.reshape(conv_output, (batch_size, output_size, output_size, anchor_per_scale, 5 + self.num_class))

        # Last axis layout: centre offsets, width/height scale factors,
        # confidence logit, class logits.
        conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
        conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
        conv_raw_conf = conv_output[:, :, :, :, 4:5]
        conv_raw_prob = conv_output[:, :, :, :, 5: ]

        # y[i, j] = i (row index) and x[i, j] = j (column index).
        y = tf.tile(tf.range(output_size, dtype=tf.int32)[:, tf.newaxis], [1, output_size])
        x = tf.tile(tf.range(output_size, dtype=tf.int32)[tf.newaxis, :], [output_size, 1])

        # Per-cell (x, y) offsets, repeated for every batch item and anchor.
        xy_grid = tf.concat([x[:, :, tf.newaxis], y[:, :, tf.newaxis]], axis=-1)
        xy_grid = tf.tile(xy_grid[tf.newaxis, :, :, tf.newaxis, :], [batch_size, 1, 1, anchor_per_scale, 1])
        xy_grid = tf.cast(xy_grid, tf.float32)

        # Centre = (sigmoid(offset) + cell index) * stride;
        # size = exp(raw) * anchor * stride.
        pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * stride
        pred_wh = (tf.exp(conv_raw_dwdh) * anchors) * stride
        pred_xywh = tf.concat([pred_xy, pred_wh], axis=-1)

        pred_conf = tf.sigmoid(conv_raw_conf)
        pred_prob = tf.sigmoid(conv_raw_prob)
        # Same axes as the reshaped input; last axis is now
        # (x, y, w, h, confidence, class probabilities).
        return tf.concat([pred_xywh, pred_conf, pred_prob], axis=-1)

#对网络的输出进行处理 去除一些框 基于原图去除
def postprocess_boxes(pred_bbox, org_img_shape, input_size, score_threshold):
    """Map raw predictions back onto the original image and drop weak boxes.

    :param pred_bbox: array-like of rows ``(x, y, w, h, conf, class probs...)``
                      in the coordinates of the letterboxed network input
    :param org_img_shape: ``(height, width)`` of the original image
    :param input_size: side length of the square network input
    :param score_threshold: minimum ``conf * best class prob`` to keep a box
    :return: array of rows ``(xmin, ymin, xmax, ymax, score, class)``
    """
    min_side, max_side = 0, np.inf
    preds = np.array(pred_bbox)

    centres = preds[:, 0:2]
    extents = preds[:, 2:4]
    objectness = preds[:, 4]
    class_probs = preds[:, 5:]

    # (x, y, w, h) -> (xmin, ymin, xmax, ymax)
    corners = np.concatenate([centres - extents * 0.5,
                              centres + extents * 0.5], axis=-1)

    # Undo the letterbox: remove the padding offset, then the resize ratio.
    org_h, org_w = org_img_shape
    ratio = min(input_size / org_w, input_size / org_h)
    pad_x = (input_size - ratio * org_w) / 2
    pad_y = (input_size - ratio * org_h) / 2
    corners[:, [0, 2]] = (corners[:, [0, 2]] - pad_x) / ratio
    corners[:, [1, 3]] = (corners[:, [1, 3]] - pad_y) / ratio

    # Clip to the image; boxes that end up inverted are zeroed out.
    top_left = np.maximum(corners[:, :2], [0, 0])
    bottom_right = np.minimum(corners[:, 2:], [org_w - 1, org_h - 1])
    corners = np.concatenate([top_left, bottom_right], axis=-1)
    inverted = (corners[:, 0] > corners[:, 2]) | (corners[:, 1] > corners[:, 3])
    corners[inverted] = 0

    # Geometric-mean side length must lie strictly inside (min_side, max_side).
    side = np.sqrt(np.prod(corners[:, 2:4] - corners[:, 0:2], axis=-1))
    size_ok = (min_side < side) & (side < max_side)

    # Score = objectness * probability of the most likely class.
    best_class = np.argmax(class_probs, axis=-1)
    confidence = objectness * class_probs[np.arange(len(corners)), best_class]
    keep = size_ok & (confidence > score_threshold)

    return np.concatenate([corners[keep],
                           confidence[keep][:, np.newaxis],
                           best_class[keep][:, np.newaxis]], axis=-1)

#iou计算用于nms
def bboxes_iou(boxes1, boxes2):
    """Compute the IoU of corner-format boxes, broadcasting over leading axes.

    :param boxes1: array-like whose last axis is ``(xmin, ymin, xmax, ymax)``
    :param boxes2: array-like with the same layout, broadcast-compatible
    :return: IoU values floored at float32 machine epsilon (never exactly 0)

    Pairs whose union area is zero (both boxes degenerate) are reported as the
    floor value instead of producing a 0/0 NaN and a RuntimeWarning.
    """
    boxes1 = np.array(boxes1)
    boxes2 = np.array(boxes2)

    boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
    boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])

    left_up       = np.maximum(boxes1[..., :2], boxes2[..., :2])
    right_down    = np.minimum(boxes1[..., 2:], boxes2[..., 2:])

    # Clamp to zero so disjoint boxes give an empty intersection.
    inter_section = np.maximum(right_down - left_up, 0.0)
    inter_area    = inter_section[..., 0] * inter_section[..., 1]
    union_area    = boxes1_area + boxes2_area - inter_area

    # Divide only where the union is positive; substitute a harmless
    # denominator elsewhere so no invalid division is ever evaluated.
    has_area   = union_area > 0
    safe_union = np.where(has_area, union_area, 1.0)
    ious       = np.where(has_area, 1.0 * inter_area / safe_union, 0.0)

    return np.maximum(ious, np.finfo(np.float32).eps)

#对输出框进行nms处理
def nms(bboxes, iou_threshold, sigma=0.3, method='nms'):
    """Run per-class non-maximum suppression.

    :param bboxes: array of rows (xmin, ymin, xmax, ymax, score, class)
    :param iou_threshold: overlap above which a box is suppressed ('nms' only)
    :param sigma: Gaussian decay parameter ('soft-nms' only)
    :param method: 'nms' or 'soft-nms'
    :return: list of the surviving box rows

    Note: soft-nms, https://arxiv.org/pdf/1704.04503.pdf
          https://github.com/bharatsingh430/soft-nms
    """
    kept = []

    for cls in list(set(bboxes[:, 5])):
        candidates = bboxes[bboxes[:, 5] == cls]

        while len(candidates) > 0:
            # Take the highest-scoring remaining box of this class.
            top = np.argmax(candidates[:, 4])
            winner = candidates[top]
            kept.append(winner)
            candidates = np.delete(candidates, top, axis=0)

            overlap = bboxes_iou(winner[np.newaxis, :4], candidates[:, :4])
            decay = np.ones((len(overlap),), dtype=np.float32)

            assert method in ['nms', 'soft-nms']

            if method == 'nms':
                # Hard suppression: overlapping boxes get their score zeroed.
                decay[overlap > iou_threshold] = 0.0
            elif method == 'soft-nms':
                # Soft suppression: scores decay smoothly with the overlap.
                decay = np.exp(-(1.0 * overlap ** 2 / sigma))

            candidates[:, 4] = candidates[:, 4] * decay
            candidates = candidates[candidates[:, 4] > 0.]

    return kept

#画框框
def draw_bbox(image, bboxes, classes=None, show_label=True):
    """Draw detection boxes (and optional labels) onto ``image`` in place.

    :param image: H x W x 3 image array; it is modified and also returned
    :param bboxes: iterable of [x_min, y_min, x_max, y_max, probability, cls_id]
    :param classes: mapping from class id to class name. When omitted, the
                    names are read from 'coco.names' on first use and cached,
                    so merely defining/importing this function no longer
                    requires that file to exist.
    :param show_label: if True, draw "name: score" above each box
    :return: the same ``image`` object
    """
    if classes is None:
        cached = getattr(draw_bbox, '_default_classes', None)
        if cached is None:
            cached = draw_bbox._default_classes = read_class_names('coco.names')
        classes = cached

    num_classes = len(classes)
    image_h, image_w, _ = image.shape

    # One evenly spaced hue per class, converted to 0-255 RGB tuples.
    hsv_tuples = [(1.0 * x / num_classes, 1., 1.) for x in range(num_classes)]
    colors = [tuple(int(channel * 255) for channel in colorsys.hsv_to_rgb(*hsv))
              for hsv in hsv_tuples]

    # Fixed-seed shuffle on a private generator: same colour order as seeding
    # the module-level RNG with 0, but without disturbing its global state.
    random.Random(0).shuffle(colors)

    fontScale = 0.5
    bbox_thick = int(0.6 * (image_h + image_w) / 600)

    for bbox in bboxes:
        coor = np.array(bbox[:4], dtype=np.int32)
        score = bbox[4]
        class_ind = int(bbox[5])
        bbox_color = colors[class_ind]
        # Plain Python ints for the drawing calls.
        c1 = (int(coor[0]), int(coor[1]))
        c2 = (int(coor[2]), int(coor[3]))
        cv2.rectangle(image, c1, c2, bbox_color, bbox_thick)

        if show_label:
            bbox_mess = '%s: %.2f' % (classes[class_ind], score)
            t_size = cv2.getTextSize(bbox_mess, 0, fontScale, thickness=bbox_thick//2)[0]
            # Filled background rectangle behind the label text.
            cv2.rectangle(image, c1, (c1[0] + t_size[0], c1[1] - t_size[1] - 3), bbox_color, -1)

            cv2.putText(image, bbox_mess, (c1[0], c1[1]-2), cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale, (0, 0, 0), bbox_thick//2, lineType=cv2.LINE_AA)

    return image


#音视频分离
def open_audio_video(mp4_file_path, video_path, audio_path):
    """Split the first two streams of a media file into separate mp4 files.

    Stream ``0:0`` is written to ``audio_path`` and stream ``0:1`` to
    ``video_path``.

    NOTE(review): which stream is audio and which is video depends on the
    input file; the __main__ block opens ``audio_path`` as its video source,
    so stream 0:0 is presumably the video track - confirm the naming.

    :param mp4_file_path: input media file
    :param video_path: output file receiving stream 0:1
    :param audio_path: output file receiving stream 0:0
    """
    split_outputs = {
        audio_path: ['-map', '0:0', '-c:a', 'copy', '-f', 'mp4'],
        video_path: ['-map', '0:1', '-c:a', 'copy', '-f', 'mp4'],
    }
    splitter = FFmpeg(inputs={mp4_file_path: None}, outputs=split_outputs)
    splitter.run()

#音视频合并
def close_audio_video(out_mp4_file_path, video_path, audio_path):
    """Mux two media files into one output encoded as h264 video / aac audio.

    :param out_mp4_file_path: path of the merged output file
    :param video_path: second ffmpeg input
    :param audio_path: first ffmpeg input
    """
    merge_inputs = {audio_path: None, video_path: None}
    merge_outputs = {out_mp4_file_path: '-c:v h264 -c:a aac'}
    FFmpeg(inputs=merge_inputs, outputs=merge_outputs).run()

if __name__ == '__main__':
    # Demo entry point: restore the checkpoint, then run detection either on a
    # single image (img_video = True) or on every frame of a video (False).
    #
    # Changes versus the earlier version of the video branch:
    #   * reaching the end of the video now ends the loop instead of raising,
    #     so the capture/writer are released and the final mux step runs;
    #   * the VideoWriter is released so the output file is finalized;
    #   * the per-frame delay is guarded against fps == 0 and against a
    #     0 ms delay (cv2.waitKey(0) would block until a key press);
    #   * unused locals were removed.

    img_video = False
    with tf.Graph().as_default():
        # Build the inference graph.
        input_data = tf.placeholder(dtype=tf.float32, name='input_data')
        training = tf.placeholder(dtype=tf.bool, name='trainable')
        network = YOLOV3(input_data,training)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, './checkpoint/yolov3_coco_demo.ckpt')
            if(img_video):
                # Read the image and convert it to RGB.
                # NOTE(review): image_preporcess applies COLOR_BGR2RGB again, so
                # the channels fed to the network are swapped twice - confirm
                # this matches how the checkpoint was trained.
                original_image = cv2.imread('road.jpeg')
                original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
                original_image_size = original_image.shape[:2]
                # Letterbox to 416x416 and add the batch dimension.
                image_data = image_preporcess(np.copy(original_image), [416, 416])
                image_data = image_data[np.newaxis, ...]
                # Forward pass.
                pred_sbbox,pred_mbbox,pred_lbbox = \
                sess.run([network.pred_sbbox,network.pred_mbbox,network.pred_lbbox],\
                         feed_dict={input_data:image_data,training:False})
                # Merge the predictions of the three scales.
                pred_bbox = np.concatenate([np.reshape(pred_sbbox, (-1, 85)),
                                    np.reshape(pred_mbbox, (-1, 85)),
                                    np.reshape(pred_lbbox, (-1, 85))], axis=0)
                # Post-process and apply NMS.
                bboxes = postprocess_boxes(pred_bbox, original_image_size, 416, 0.3)
                bboxes = nms(bboxes, 0.45, method='nms')
                # Draw on the original image and show it.
                image = draw_bbox(original_image, bboxes)
                image1 = Image.fromarray(image)
                image1.show()
            else:
                video_file_path = 'road.mp4'
                video_path = 'video.mp4'
                audio_path = 'audio.mp4'
                # Split the source file into its two streams.
                open_audio_video(video_file_path, video_path, audio_path)
                # open_audio_video writes stream 0:0 to `audio_path`; that file
                # is what gets opened as the frame source here.
                vid = cv2.VideoCapture(audio_path)
                fps = vid.get(cv2.CAP_PROP_FPS)
                size = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
                size1 = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
                video2 = cv2.VideoWriter('PSY.avi', cv2.VideoWriter_fourcc('M', 'P', '4', '2'), int(fps), (int(size),int(size1)))
                # Delay between displayed frames; at least 1 ms.
                delay_ms = max(1, int(1000/fps)) if fps > 0 else 1
                frames_done = 0
                try:
                    cv2.namedWindow("result", cv2.WINDOW_AUTOSIZE)
                    while True:
                        # read() returns (False, None) once no more frames are available.
                        return_value, frame = vid.read()
                        if not return_value:
                            break
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frame_size = frame.shape[:2]
                        image_data = image_preporcess(np.copy(frame), [416, 416])
                        image_data = image_data[np.newaxis, ...]

                        pred_sbbox,pred_mbbox,pred_lbbox = \
                        sess.run([network.pred_sbbox,network.pred_mbbox,network.pred_lbbox],\
                                 feed_dict={input_data:image_data,training:False})

                        pred_bbox = np.concatenate([np.reshape(pred_sbbox, (-1, 85)),
                                                    np.reshape(pred_mbbox, (-1, 85)),
                                                    np.reshape(pred_lbbox, (-1, 85))], axis=0)

                        bboxes = postprocess_boxes(pred_bbox, frame_size, 416, 0.3)
                        bboxes = nms(bboxes, 0.45, method='nms')
                        image = draw_bbox(frame, bboxes)

                        # Back to BGR for display and for the video writer.
                        result = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                        cv2.imshow("result", result)
                        video2.write(result)   # image only, no audio
                        frames_done += 1
                        if cv2.waitKey(delay_ms) & 0xFF == ord('q'): break
                finally:
                    # Always release the streams, even if inference fails.
                    vid.release()
                    video2.release()
                    cv2.destroyAllWindows()
                # A source that yields no frame at all is still an error.
                if frames_done == 0:
                    raise ValueError("No image!")
                # Mux the annotated frames with the other stream.
                close_audio_video('PSY_yolo_style.avi', video_path,'PSY.avi')

参考链接：

[1]https://zhuanlan.zhihu.com/p/79425557

[2]https://zhuanlan.zhihu.com/p/80056633

[3]https://zhuanlan.zhihu.com/p/80208709

[4]https://zhuanlan.zhihu.com/p/80600110

[5]https://github.com/YunYang1994/tensorflow-yolov3

[6]https://blog.csdn.net/leviopku/article/details/82660381

[7]https://www.bilibili.com/video/BV1i64y1u7Zj?p=2

li_jixing_0425

关注

2
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
yolov3(tensorflow1)的代码阅读

本文主要通过啃https://github.com/YunYang1994/tensorflow-yolov3代码来学习了下yolov3，此外从B站up主https://www.bilibili.com/video/BV1i64y1u7Zj?p=2学习了目标检测的基础知识，例如NMS,IOU,YOLO发展等。一、网络结构，前向计算过程上述结构图源自参考链接[6]，个人总结以下特点：Darknet-53主要为残差网络结构；DBL网络块中利用卷积代替池化进行2倍下采样；3尺度输出，每一种尺度对应
复制链接

扫一扫