TensorFlow YOLO_V2 Test Code Walkthrough

A walkthrough of the TensorFlow YOLO_V2 test code; after reading it, the whole YOLO v2 inference pipeline should be clear at a glance.

Hardware: NVIDIA GTX 1080

Software: Windows 7, Python 3.6.5, tensorflow-gpu 1.4.0

I. Basics

1. Anchor boxes (w, h)

The anchors are computed with k-means clustering. Ordinary k-means measures the distance between points and iterates until that distance is minimized, but detection clusters boxes rather than points, so the authors use IOU as the criterion: the larger the IOU, the smaller the distance. This yields the following distance function:

d(box, centroid) = 1 - IOU(box, centroid)

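To make the criterion concrete, here is a minimal NumPy sketch of k-means with d = 1 - IOU (not from the original repo; iou_wh and kmeans_anchors are illustrative names, and boxes is a float array of ground-truth (w, h) pairs):

import numpy as np

def iou_wh(box, centroids):
    # IOU between one (w, h) box and k (w, h) centroids, all sharing the same center
    w = np.minimum(box[0], centroids[:, 0])
    h = np.minimum(box[1], centroids[:, 1])
    inter = w * h
    union = box[0] * box[1] + centroids[:, 0] * centroids[:, 1] - inter
    return inter / union

def kmeans_anchors(boxes, k=5, iters=100):
    # cluster (w, h) pairs with distance d = 1 - IOU
    centroids = boxes[np.random.choice(len(boxes), k, replace=False)]
    for _ in range(iters):
        dists = np.array([1.0 - iou_wh(b, centroids) for b in boxes])  # (N, k)
        assign = np.argmin(dists, axis=1)  # nearest centroid = largest IOU
        for c in range(k):
            if np.any(assign == c):
                centroids[c] = boxes[assign == c].mean(axis=0)
    return centroids
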
2. Location prediction

In V2's bounding-box regression, the center coordinates (x, y) are regressed exactly as in V1; what changes is the regression of w and h. In V2, width and height are regressed relative to the anchor boxes: the network predicts scaling coefficients for the anchor's w and h, and multiplying them out gives the final box, which is also how YOLO's training labels are constructed. The paper's equations are reproduced below.

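For reference, these are the location-prediction equations from the YOLOv2 paper, which decode() in detect_ops.py implements; (cx, cy) is the grid cell's offset from the image's top-left corner, (pw, ph) are the anchor's width and height, and (tx, ty, tw, th) are the raw network outputs:

bx = sigmoid(tx) + cx
by = sigmoid(ty) + cy
bw = pw * exp(tw)
bh = ph * exp(th)
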
3. Multi-Scale Training

During training, V2 changes the model's input image size every fixed number of iterations. Because YOLOv2's total downsampling stride is 32, the input sizes are drawn from multiples of 32: 320, 352, ..., 608. With a 608x608 input, the output feature map is 19x19, which gives 19x19x5 = 1805 bounding boxes. In other words, a single V2 model can handle inputs of different sizes, so an appropriate resolution can be chosen for the situation at hand; a sketch of the sampling scheme follows.

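The multi-scale schedule belongs to training, which this test code does not cover, but a minimal sketch of the sampling scheme (illustrative only; the paper resamples every 10 batches) looks like this:

import random

SIZES = list(range(320, 608 + 1, 32))  # 320, 352, ..., 608: multiples of the 32-pixel stride

size = 416  # starting resolution
for step in range(1, 101):  # stand-in for the real training loop
    if step % 10 == 0:  # pick a new input size every 10 batches
        size = random.choice(SIZES)
    # resize the current batch to (size, size) and run one training step here
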
II. Code walkthrough (with detailed comments)

1. demo.py

"""
Demo for yolov2
"""

import numpy as np
import tensorflow as tf
import cv2
from PIL import Image

from model import darknet
from detect_ops import decode
from utils import preprocess_image, postprocess, draw_detection
from config import anchors, class_names


input_size = (416, 416)
image_file = "timg.jpg"
image = cv2.imread(image_file)
image_shape = image.shape[:2]
# preprocess with OpenCV; the PIL-based alternative below is kept for reference
image_cp = preprocess_image(image, input_size)
"""
image = Image.open(image_file)
image_cp = image.resize(input_size, Image.BICUBIC)
image_cp = np.array(image_cp, dtype=np.float32)/255.0
image_cp = np.expand_dims(image_cp, 0)
#print(image_cp)
"""

images = tf.placeholder(tf.float32, [1, input_size[0], input_size[1], 3])
detection_feat = darknet(images)
feat_sizes = input_size[0] // 32, input_size[1] // 32
detection_results = decode(detection_feat, feat_sizes, len(class_names), anchors)

checkpoint_path = "./checkpoint_dir/yolo2_coco.ckpt"
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)
    bboxes, obj_probs, class_probs = sess.run(detection_results, feed_dict={images: image_cp})

bboxes, scores, class_inds = postprocess(bboxes, obj_probs, class_probs, image_shape=image_shape)
img_detection = draw_detection(image, bboxes, scores, class_inds, class_names)
cv2.imwrite("detection.jpg", img_detection)
cv2.imshow("detection results", img_detection)

cv2.waitKey(0)

2. config.py

"""
Yolov2 anchors and coco classes
"""

"""
anchors = [[0.738768, 0.874946],
           [2.42204, 2.65704],
           [4.30971, 7.04493],
           [10.246, 4.59428],
           [12.6868, 11.8741]]
"""
anchors = [[0.57273, 0.677385],
           [1.87446, 2.06253],
           [3.33843, 5.47434],
           [7.88282, 3.52778],
           [9.77052, 9.16828]]

def read_coco_labels():
    """Read the 80 COCO class names, one name per line."""
    class_names = []
    with open("./data/coco_classes.txt") as f:
        for line in f:
            class_names.append(line.strip())
    return class_names

class_names = read_coco_labels()

3. utils.py

"""
Help functions for YOLOv2
"""
import random
import colorsys

import cv2
import numpy as np



############## preprocess image ##################


def preprocess_image(image, image_size=(416, 416)):
    """Preprocess a image to inference"""
    image_cp = np.copy(image).astype(np.float32)
    # resize the image
    image_rgb = cv2.cvtColor(image_cp, cv2.COLOR_BGR2RGB)
    image_resized = cv2.resize(image_rgb, image_size)
    # normalize
    image_normalized = image_resized.astype(np.float32) / 255.0
    # expand the batch_size dim
    # 1, 416, 416, 3
    image_expanded = np.expand_dims(image_normalized, axis=0)
    return image_expanded

def postprocess(bboxes, obj_probs, class_probs, image_shape=(416, 416),
                threshold=0.2):
    """post process the detection results"""
    # (1, 169, 5, 4) -> (1*169*5, 4)
    bboxes = np.reshape(bboxes, [-1, 4])
    # the decoded coordinates are normalized to [0, 1], so scale x coords by the
    # original image width and y coords by the original image height
    bboxes[:, 0::2] *= float(image_shape[1])  # xmin, xmax
    bboxes[:, 1::2] *= float(image_shape[0])  # ymin, ymax
    bboxes = bboxes.astype(np.int32)

    # clip boxes that extend beyond the image boundary
    bbox_ref = [0, 0, image_shape[1] - 1, image_shape[0] - 1]
    bboxes = bboxes_clip(bbox_ref, bboxes)

    # (1, 169, 5, 1) -> (1*169*5,)
    obj_probs = np.reshape(obj_probs, [-1])
    # (1, 169, 5, 80) -> (1*169*5, 80)
    class_probs = np.reshape(class_probs, [len(obj_probs), -1])
    # index of the highest-probability class for each box
    class_inds = np.argmax(class_probs, axis=1)
    # conditional probability of that predicted class
    class_probs = class_probs[np.arange(len(obj_probs)), class_inds]
    # score = objectness (trained to predict IOU) * class conditional probability
    scores = obj_probs * class_probs

    # keep only boxes whose score exceeds the threshold
    keep_inds = scores > threshold
    bboxes = bboxes[keep_inds]
    scores = scores[keep_inds]
    class_inds = class_inds[keep_inds]

    # sort by score in descending order and keep at most top_k boxes
    class_inds, scores, bboxes = bboxes_sort(class_inds, scores, bboxes)
    # class-aware NMS: a box is only suppressed by a higher-scoring box of the same class
    class_inds, scores, bboxes = bboxes_nms(class_inds, scores, bboxes)

    return bboxes, scores, class_inds

def draw_detection(im, bboxes, scores, cls_inds, labels, thr=0.3):
    # for display
    ############################
    # Generate colors for drawing bounding boxes.
    hsv_tuples = [(x / float(len(labels)), 1., 1.)
                  for x in range(len(labels))]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(
        map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
            colors))
    random.seed(10101)  # Fixed seed for consistent colors across runs.
    random.shuffle(colors)  # Shuffle colors to decorrelate adjacent classes.
    random.seed(None)  # Reset seed to default.
    # draw image
    imgcv = np.copy(im)
    h, w, _ = imgcv.shape
    for i, box in enumerate(bboxes):
        if scores[i] < thr:
            continue
        cls_indx = cls_inds[i]

        thick = int((h + w) / 300)
        cv2.rectangle(imgcv,
                      (box[0], box[1]), (box[2], box[3]),
                      colors[cls_indx], thick)
        mess = '%s: %.3f' % (labels[cls_indx], scores[i])
        if box[1] < 20:
            text_loc = (box[0] + 2, box[1] + 15)
        else:
            text_loc = (box[0], box[1] - 10)
        cv2.putText(imgcv, mess, text_loc,
                    cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * h, colors[cls_indx], thick // 3)

    return imgcv


############## process bboxes ##################
def bboxes_clip(bbox_ref, bboxes):
    """Clip bounding boxes with respect to reference bbox.
    """
    bboxes = np.copy(bboxes)
    bboxes = np.transpose(bboxes)
    bbox_ref = np.transpose(bbox_ref)
    bboxes[0] = np.maximum(bboxes[0], bbox_ref[0])
    bboxes[1] = np.maximum(bboxes[1], bbox_ref[1])
    bboxes[2] = np.minimum(bboxes[2], bbox_ref[2])
    bboxes[3] = np.minimum(bboxes[3], bbox_ref[3])
    bboxes = np.transpose(bboxes)
    return bboxes

def bboxes_sort(classes, scores, bboxes, top_k=400):
    """Sort bounding boxes by decreasing order and keep only the top_k
    """
    # if priority_inside:
    #     inside = (bboxes[:, 0] > margin) & (bboxes[:, 1] > margin) & \
    #         (bboxes[:, 2] < 1-margin) & (bboxes[:, 3] < 1-margin)
    #     idxes = np.argsort(-scores)
    #     inside = inside[idxes]
    #     idxes = np.concatenate([idxes[inside], idxes[~inside]])
    idxes = np.argsort(-scores)
    classes = classes[idxes][:top_k]
    scores = scores[idxes][:top_k]
    bboxes = bboxes[idxes][:top_k]
    return classes, scores, bboxes

def bboxes_iou(bboxes1, bboxes2):
    """Computing iou between bboxes1 and bboxes2.
    Note: bboxes1 and bboxes2 can be multi-dimensional, but should broacastable.
    """
    bboxes1 = np.transpose(bboxes1)
    bboxes2 = np.transpose(bboxes2)
    # Intersection bbox and area (boxes are [xmin, ymin, xmax, ymax]).
    int_xmin = np.maximum(bboxes1[0], bboxes2[0])
    int_ymin = np.maximum(bboxes1[1], bboxes2[1])
    int_xmax = np.minimum(bboxes1[2], bboxes2[2])
    int_ymax = np.minimum(bboxes1[3], bboxes2[3])

    int_w = np.maximum(int_xmax - int_xmin, 0.)
    int_h = np.maximum(int_ymax - int_ymin, 0.)
    int_vol = int_w * int_h
    # Union area.
    vol1 = (bboxes1[2] - bboxes1[0]) * (bboxes1[3] - bboxes1[1])
    vol2 = (bboxes2[2] - bboxes2[0]) * (bboxes2[3] - bboxes2[1])
    iou = int_vol / (vol1 + vol2 - int_vol)
    return iou

def bboxes_nms(classes, scores, bboxes, nms_threshold=0.5):
    """Apply non-maximum selection to bounding boxes.
    """
    keep_bboxes = np.ones(scores.shape, dtype=bool)
    for i in range(scores.size-1):
        if keep_bboxes[i]:
            # Compute overlap with the boxes that follow.
            overlap = bboxes_iou(bboxes[i], bboxes[(i+1):])
            # Keep a following box if its overlap is small or its class differs.
            keep_overlap = np.logical_or(overlap < nms_threshold, classes[(i+1):] != classes[i])
            keep_bboxes[(i+1):] = np.logical_and(keep_bboxes[(i+1):], keep_overlap)

    idxes = np.where(keep_bboxes)
    return classes[idxes], scores[idxes], bboxes[idxes]

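As a quick sanity check of the class-aware NMS above (a toy example, assuming bboxes_nms from this file is in scope): two heavily overlapping boxes both survive when they belong to different classes, but not when they share a class.

import numpy as np

boxes = np.array([[10, 10, 100, 100],
                  [12, 12, 102, 102]])  # IOU is about 0.92, well above the 0.5 threshold
scores = np.array([0.9, 0.8])  # already sorted descending, as bboxes_nms expects

# same class: the lower-scoring box is suppressed
print(bboxes_nms(np.array([0, 0]), scores.copy(), boxes.copy()))
# different classes: both boxes are kept
print(bboxes_nms(np.array([0, 1]), scores.copy(), boxes.copy()))
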
4. model.py

"""
YOLOv2 implemented by Tensorflow, only for predicting
"""
import os

import numpy as np
import tensorflow as tf

######## basic layers #######

def leaky_relu(x):
    return tf.nn.leaky_relu(x, alpha=0.1, name="leaky_relu")

# Conv2d
def conv2d(x, filters, size, pad=0, stride=1, batch_normalize=1,
           activation=leaky_relu, use_bias=False, name="conv2d"):
    if pad > 0:
        x = tf.pad(x, [[0, 0], [pad, pad], [pad, pad], [0, 0]])
    out = tf.layers.conv2d(x, filters, size, strides=stride, padding="VALID",
                           activation=None, use_bias=use_bias, name=name)
    if batch_normalize == 1:
        out = tf.layers.batch_normalization(out, axis=-1, momentum=0.9,
                                            training=False, name=name+"_bn")
    if activation:
        out = activation(out)
    return out

# maxpool2d
def maxpool(x, size=2, stride=2, name="maxpool"):
    return tf.layers.max_pooling2d(x, size, stride, name=name)

# reorg layer: rearranges each 2x2 spatial block into the channel dimension,
# e.g. a (1, 26, 26, 64) tensor becomes (1, 13, 13, 256)
# https://zhuanlan.zhihu.com/p/35325884
def reorg(x, ksize, stride, rate):
    return tf.extract_image_patches(x, [1, ksize, ksize, 1],
                        [1, stride, stride, 1], [1, rate, rate, 1], padding="VALID")


def darknet(images, n_last_channels=425):
    """Darknet19 for YOLOv2"""
    net = conv2d(images, 32, 3, 1, name="conv1")
    net = maxpool(net, name="pool1")
    net = conv2d(net, 64, 3, 1, name="conv2")
    net = maxpool(net, name="pool2")
    net = conv2d(net, 128, 3, 1, name="conv3_1")
    net = conv2d(net, 64, 1, name="conv3_2")
    net = conv2d(net, 128, 3, 1, name="conv3_3")
    net = maxpool(net, name="pool3")
    net = conv2d(net, 256, 3, 1, name="conv4_1")
    net = conv2d(net, 128, 1, name="conv4_2")
    net = conv2d(net, 256, 3, 1, name="conv4_3")
    net = maxpool(net, name="pool4")
    net = conv2d(net, 512, 3, 1, name="conv5_1")
    net = conv2d(net, 256, 1, name="conv5_2")
    net = conv2d(net, 512, 3, 1, name="conv5_3")
    net = conv2d(net, 256, 1, name="conv5_4")
    net = conv2d(net, 512, 3, 1, name="conv5_5")

    # 1,26,26,512
    shortcut = net
    
    net = maxpool(net, name="pool5")
    net = conv2d(net, 1024, 3, 1, name="conv6_1")
    net = conv2d(net, 512, 1, name="conv6_2")
    net = conv2d(net, 1024, 3, 1, name="conv6_3")
    net = conv2d(net, 512, 1, name="conv6_4")
    net = conv2d(net, 1024, 3, 1, name="conv6_5")
    # ---------
    net = conv2d(net, 1024, 3, 1, name="conv7_1")
    # 1,13,13,1024
    net = conv2d(net, 1024, 3, 1, name="conv7_2")
    
    # shortcut
    # 1,26,26,512 -> 1,26,26,64
    shortcut = conv2d(shortcut, 64, 1, name="conv_shortcut")
    # 1,26,26,64 -> 1,13,13,256
    shortcut = reorg(shortcut, 2, 2, 1)

    # small objects need the finer-grained features from the earlier layer
    # 1,13,13,256 + 1,13,13,1024 -> 1,13,13,1280 (passthrough / skip connection)
    net = tf.concat([shortcut, net], axis=-1)
    
    net = conv2d(net, 1024, 3, 1, name="conv8")
    # detection layer
    net = conv2d(net, n_last_channels, 1, batch_normalize=0,
                 activation=None, use_bias=True, name="conv_dec")
    return net

'''
if __name__ == "__main__":
    x = tf.random_normal([1, 416, 416, 3])
    model = darknet(x)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, "./checkpoint_dir/yolo2_coco.ckpt")
        print(sess.run(model).shape)
'''

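The reorg layer is the easiest place to get shapes wrong, so here is a quick standalone check (TF 1.x, illustrative only) that a 1x26x26x64 tensor really becomes 1x13x13x256:

import tensorflow as tf

x = tf.random_normal([1, 26, 26, 64])
patches = tf.extract_image_patches(x, [1, 2, 2, 1], [1, 2, 2, 1],
                                   [1, 1, 1, 1], padding="VALID")
with tf.Session() as sess:
    print(sess.run(patches).shape)  # (1, 13, 13, 256): each 2x2 block moves into channels
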
5. detect_ops.py

"""
Detection ops for Yolov2
"""

import tensorflow as tf
import numpy as np


def decode(detection_feat, feat_sizes=(13, 13), num_classes=80,
           anchors=None):
    """decode from the detection feature"""
    H, W = feat_sizes
    num_anchors = len(anchors)
    # 1,13,13,425 -> 1,169,5,85
    detection_results = tf.reshape(detection_feat, [-1, H * W, num_anchors,
                                        num_classes + 5])

    # the four box values are decoded on the fly at inference time; no backprop involved
    bbox_xy = tf.nn.sigmoid(detection_results[:, :, :, 0:2])
    bbox_wh = tf.exp(detection_results[:, :, :, 2:4])
    obj_probs = tf.nn.sigmoid(detection_results[:, :, :, 4])
    class_probs = tf.nn.softmax(detection_results[:, :, :, 5:])

    anchors = tf.constant(anchors, dtype=tf.float32)

    height_ind = tf.range(H, dtype=tf.float32)
    width_ind = tf.range(W, dtype=tf.float32)
    # meshgrid(x, y): x_offset varies along columns, y_offset along rows
    x_offset, y_offset = tf.meshgrid(width_ind, height_ind)
    x_offset = tf.reshape(x_offset, [1, -1, 1])  # 1,169,1
    y_offset = tf.reshape(y_offset, [1, -1, 1])  # 1,169,1

    # decode
    # x, y are offsets within each grid cell, as in YOLOv1
    bbox_x = (bbox_xy[:, :, :, 0] + x_offset) / W
    bbox_y = (bbox_xy[:, :, :, 1] + y_offset) / H
    # w, h are scaled relative to each anchor (unlike YOLOv1); the 0.5 gives half-sizes
    bbox_w = bbox_wh[:, :, :, 0] * anchors[:, 0] / W * 0.5
    bbox_h = bbox_wh[:, :, :, 1] * anchors[:, 1] / H * 0.5

    # top-left / bottom-right corner coordinates, normalized to [0, 1]
    # 1,169,5,4
    bboxes = tf.stack([bbox_x - bbox_w, bbox_y - bbox_h,
                       bbox_x + bbox_w, bbox_y + bbox_h], axis=3)

    return bboxes, obj_probs, class_probs

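As a final shape sanity check on decode() (illustrative only; assumes decode and the anchors list from config.py are importable), a 416x416 input gives a 13x13 grid with 5 anchors and 80 classes:

import tensorflow as tf

from detect_ops import decode
from config import anchors

feat = tf.zeros([1, 13, 13, 425])  # 425 = 5 * (80 + 5)
bboxes, obj_probs, class_probs = decode(feat, (13, 13), 80, anchors)
print(bboxes.shape)       # (1, 169, 5, 4)
print(obj_probs.shape)    # (1, 169, 5)
print(class_probs.shape)  # (1, 169, 5, 80)
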
III. Model download

https://pan.baidu.com/s/1ZeT5HerjQxyUZ_L9d3X52w

IV. Results

V. References

https://zhuanlan.zhihu.com/p/35325884

For any questions, add my one and only QQ 2258205918 (name: samylee)!

My only WeChat: samylee_csdn
