1、anchor boxes (w, h)
2、location prediction
在V2的边框回归中，中心点坐标x、y的回归方式相对于V1没有改变（仍然是相对于所在网格单元的偏移），改变的是w、h的回归。V2的宽高回归是相对于anchor boxes而言的：网络输出经过指数变换后作为anchor boxes宽、高的缩放系数，二者相乘的结果就是YOLO预测框的宽和高。
3、Multi-Scale Training
Demo for yolov2
import numpy as np
import tensorflow as tf
import cv2
from PIL import Image
from model import darknet
from detect_ops import decode
from utils import preprocess_image, postprocess, draw_detection
from config import anchors, class_names
# Network input resolution; the placeholder below uses it as (height, width).
input_size = (416, 416)
image_file = "timg.jpg"

# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear message.
image = cv2.imread(image_file)
if image is None:
    raise FileNotFoundError("cannot read image file: %s" % image_file)
image_shape = image.shape[:2]

# Build the network input from the OpenCV (BGR) image.  `image` itself is kept
# untouched as the BGR array that draw_detection / cv2.imwrite expect; the
# previous PIL-based alternative rebound `image` to an RGB PIL image, which
# made the saved result come out with swapped red/blue channels.
image_cp = preprocess_image(image, input_size)

# Build the inference graph.
images = tf.placeholder(tf.float32, [1, input_size[0], input_size[1], 3])
detection_feat = darknet(images)
# The demo assumes the detection feature map is the input downsampled by 32.
feat_sizes = input_size[0] // 32, input_size[1] // 32
detection_results = decode(detection_feat, feat_sizes, len(class_names), anchors)

checkpoint_path = "./checkpoint_dir/yolo2_coco.ckpt"
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)
    bboxes, obj_probs, class_probs = sess.run(detection_results,
                                              feed_dict={images: image_cp})

# Scale boxes back to the original image, threshold, sort and run NMS.
bboxes, scores, class_inds = postprocess(bboxes, obj_probs, class_probs,
                                         image_shape=image_shape)
img_detection = draw_detection(image, bboxes, scores, class_inds, class_names)
cv2.imwrite("detection.jpg", img_detection)
cv2.imshow("detection results", img_detection)
# imshow only renders once the HighGUI event loop runs; block until a key press.
cv2.waitKey(0)
Yolov2 anchors and coco classes
# NOTE(review): this first anchor set is overwritten by the assignment right
# below and is therefore never used -- confirm whether it was only meant to be
# kept as a reference (e.g. for a different checkpoint).
anchors = [[0.738768, 0.874946],
[2.42204, 2.65704],
[4.30971, 7.04493],
[10.246, 4.59428],
[12.6868, 11.8741]]
# Anchor (width, height) pairs that are actually in effect.  decode() divides
# them by the feature-map size, so presumably they are expressed in grid-cell
# units -- TODO confirm.
anchors = [[0.57273, 0.677385],
[1.87446, 2.06253],
[3.33843, 5.47434],
[7.88282, 3.52778],
[9.77052, 9.16828]]
def read_coco_labels(path="./data/coco_classes.txt"):
    """Read class names from a text file containing one name per line.

    Args:
        path: location of the label file.  Defaults to the path the original
            implementation had hard-coded, so existing callers are unaffected.

    Returns:
        A list of class-name strings in file order.  Surrounding whitespace is
        stripped and blank lines are skipped, so a trailing newline cannot add
        a spurious empty class.
    """
    # The context manager closes the file even if reading fails.
    with open(path) as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]
# Class names are loaded once, as a module-level side effect of importing this file.
class_names = read_coco_labels()
Helper functions for YOLOv2
import random
import colorsys
import cv2
import numpy as np
############## preprocess image ##################
def preprocess_image(image, image_size=(416, 416)):
    """Turn a BGR image into a normalized single-image batch for inference.

    Args:
        image: image array in BGR channel order (it is converted with
            cv2.COLOR_BGR2RGB).
        image_size: target size handed to cv2.resize unchanged.
            NOTE(review): cv2.resize interprets this as (width, height);
            confirm the order callers use for non-square inputs.

    Returns:
        float32 array with a leading batch axis of size 1, values divided by 255.
    """
    # Work on a float copy so the caller's image is never modified.
    as_float = np.copy(image).astype(np.float32)
    rgb = cv2.cvtColor(as_float, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, image_size)
    scaled = resized.astype(np.float32) / 255.0
    # Add the batch dimension in front: (h, w, 3) -> (1, h, w, 3).
    return np.expand_dims(scaled, axis=0)
def postprocess(bboxes, obj_probs, class_probs, image_shape=(416, 416),
                threshold=0.5):
    """Convert raw decoded detections into final boxes, scores and classes.

    Args:
        bboxes: normalized [xmin, ymin, xmax, ymax] boxes, any leading shape
            with a last axis of 4 (e.g. 1 x 169 x 5 x 4).
        obj_probs: objectness probabilities, one per box.
        class_probs: per-class conditional probabilities, one row per box.
        image_shape: (height, width) of the image the boxes are mapped onto.
        threshold: minimum score (objectness * best class probability) a box
            needs in order to be kept.

    Returns:
        Tuple (bboxes, scores, class_inds): int32 pixel boxes clipped to the
        image, their scores sorted in decreasing order, and class indices,
        after per-class non-maximum suppression.
    """
    # 1,169,5,4 -> 1x169x5,4.  Copy first: reshape returns a view, and the
    # in-place scaling below would otherwise modify the caller's array.
    bboxes = np.reshape(bboxes, [-1, 4]).copy()
    # Boxes are normalized, so multiplying by the target image width/height
    # maps them straight to pixel coordinates of that image.
    bboxes[:, 0::2] *= float(image_shape[1])  # xmin, xmax
    bboxes[:, 1::2] *= float(image_shape[0])  # ymin, ymax
    bboxes = bboxes.astype(np.int32)

    # Clip coordinates that fall outside the image borders.
    bbox_ref = [0, 0, image_shape[1] - 1, image_shape[0] - 1]
    bboxes = bboxes_clip(bbox_ref, bboxes)

    # 1,169,5 -> 1*169*5
    obj_probs = np.reshape(obj_probs, [-1])
    # 1,169,5,80 -> 1*169*5,80
    class_probs = np.reshape(class_probs, [len(obj_probs), -1])
    # Index of the most likely class for every box.
    class_inds = np.argmax(class_probs, axis=1)
    # Conditional probability of that best class.
    class_probs = class_probs[np.arange(len(obj_probs)), class_inds]
    # score = objectness * conditional class probability
    scores = obj_probs * class_probs

    # Keep only boxes whose score exceeds the threshold.
    keep_inds = scores > threshold
    bboxes = bboxes[keep_inds]
    scores = scores[keep_inds]
    class_inds = class_inds[keep_inds]

    # Sort by decreasing score and keep the top_k.
    class_inds, scores, bboxes = bboxes_sort(class_inds, scores, bboxes)
    # NMS; bboxes_nms only suppresses boxes that share the class of the kept box.
    class_inds, scores, bboxes = bboxes_nms(class_inds, scores, bboxes)
    return bboxes, scores, class_inds
def draw_detection(im, bboxes, scores, cls_inds, labels, thr=0.3):
    """Draw labelled bounding boxes on a copy of an image.

    Args:
        im: image array of shape (height, width, channels); it is not modified.
        bboxes: iterable of [xmin, ymin, xmax, ymax] pixel boxes.
        scores: score of each box.
        cls_inds: class index of each box (index into ``labels``).
        labels: sequence of class names.
        thr: boxes scoring below this value are not drawn.

    Returns:
        A copy of ``im`` with rectangles and "label: score" captions drawn.
    """
    # One evenly spaced hue per class, converted to 0-255 integer triples.
    n_labels = len(labels)
    hsv_tuples = [(x / float(n_labels), 1., 1.) for x in range(n_labels)]
    colors = [
        tuple(int(channel * 255) for channel in colorsys.hsv_to_rgb(*hsv))
        for hsv in hsv_tuples
    ]
    # Fixed seed for consistent colors across runs; shuffling decorrelates
    # adjacent classes.  A private generator yields the same order as seeding
    # the module-level one, without disturbing the global random state.
    random.Random(10101).shuffle(colors)

    imgcv = np.copy(im)
    h, w, _ = imgcv.shape
    # Line width scales with image size; keep at least 1 px for small images.
    thick = max(1, int((h + w) / 300))

    for box, score, cls_indx in zip(bboxes, scores, cls_inds):
        if score < thr:
            continue
        # Plain Python ints: some OpenCV builds reject NumPy integer scalars
        # as point coordinates.
        x_min, y_min, x_max, y_max = (int(v) for v in box)
        color = colors[cls_indx]
        cv2.rectangle(imgcv, (x_min, y_min), (x_max, y_max), color, thick)
        mess = '%s: %.3f' % (labels[cls_indx], score)
        # Put the caption inside the box when there is no room above it.
        if y_min < 20:
            text_loc = (x_min + 2, y_min + 15)
        else:
            text_loc = (x_min, y_min - 10)
        cv2.putText(imgcv, mess, text_loc,
                    cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * h, color,
                    max(1, thick // 3))
    return imgcv
############## process bboxes ##################
def bboxes_clip(bbox_ref, bboxes):
    """Clip bounding boxes with respect to a reference bbox.

    Args:
        bbox_ref: reference box ``[min0, min1, max0, max1]`` (or an array of
            such boxes broadcastable against ``bboxes``).
        bboxes: array-like whose last axis holds ``[min0, min1, max0, max1]``.

    Returns:
        A new array (the input is left untouched) in which every coordinate
        lies inside the reference range of its axis.  A box lying completely
        outside the reference collapses to a zero-area box on the border
        instead of becoming an inverted box with min > max.
    """
    clipped = np.array(bboxes, copy=True)
    ref = np.asarray(bbox_ref)
    lo0, lo1 = ref[..., 0], ref[..., 1]
    hi0, hi1 = ref[..., 2], ref[..., 3]
    clipped[..., 0] = np.clip(clipped[..., 0], lo0, hi0)
    clipped[..., 1] = np.clip(clipped[..., 1], lo1, hi1)
    clipped[..., 2] = np.clip(clipped[..., 2], lo0, hi0)
    clipped[..., 3] = np.clip(clipped[..., 3], lo1, hi1)
    return clipped
def bboxes_sort(classes, scores, bboxes, top_k=400):
    """Sort detections by decreasing score and keep only the top_k.

    Args:
        classes: class index per detection.
        scores: score per detection; defines the ordering.
        bboxes: one box per detection (first axis matches ``scores``).
        top_k: maximum number of detections to return.

    Returns:
        Tuple (classes, scores, bboxes) reordered by decreasing score and
        truncated to at most ``top_k`` entries.
    """
    classes = np.asarray(classes)
    scores = np.asarray(scores)
    bboxes = np.asarray(bboxes)
    # Truncate the index vector once instead of slicing every gathered array.
    order = np.argsort(-scores)[:top_k]
    return classes[order], scores[order], bboxes[order]
def bboxes_iou(bboxes1, bboxes2):
    """Compute the IoU between bboxes1 and bboxes2.

    Boxes store ``[min0, min1, max0, max1]`` along their last axis; the result
    does not depend on which axis is x and which is y.
    Note: bboxes1 and bboxes2 can be multi-dimensional, but must be
    broadcastable against each other.

    Returns:
        Intersection-over-union values.  Pairs whose union area is not
        positive (e.g. two zero-area boxes) yield 0 instead of NaN/inf.
    """
    b1 = np.transpose(bboxes1)
    b2 = np.transpose(bboxes2)

    # Extent of the intersection along each axis, floored at zero.
    inter_0 = np.maximum(np.minimum(b1[2], b2[2]) - np.maximum(b1[0], b2[0]), 0.)
    inter_1 = np.maximum(np.minimum(b1[3], b2[3]) - np.maximum(b1[1], b2[1]), 0.)
    inter_area = inter_0 * inter_1

    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    union = area1 + area2 - inter_area

    # Guard the division: degenerate boxes give a zero union.
    valid = union > 0
    safe_union = np.where(valid, union, 1.0)
    return np.where(valid, inter_area / safe_union, 0.0)
def bboxes_nms(classes, scores, bboxes, nms_threshold=0.5):
    """Apply non-maximum suppression to bounding boxes.

    Each still-kept box is compared with every box after it; a later box is
    dropped when it has the same class and an IoU of at least
    ``nms_threshold`` with it.  With inputs sorted by decreasing score this
    keeps the best-scoring box of each overlapping same-class group.

    Args:
        classes: class index per box.
        scores: score per box.
        bboxes: boxes, one row per entry of ``scores``.
        nms_threshold: IoU at or above which a same-class box is suppressed.

    Returns:
        Tuple (classes, scores, bboxes) restricted to the kept boxes.
    """
    # Builtin bool: the np.bool alias was removed in NumPy 1.24.
    keep_bboxes = np.ones(scores.shape, dtype=bool)
    for i in range(scores.size - 1):
        if not keep_bboxes[i]:
            continue
        # Overlap with the boxes that follow.
        overlap = bboxes_iou(bboxes[i], bboxes[(i + 1):])
        # A later box survives if it overlaps little or belongs to another class.
        keep_overlap = np.logical_or(overlap < nms_threshold,
                                     classes[(i + 1):] != classes[i])
        keep_bboxes[(i + 1):] = np.logical_and(keep_bboxes[(i + 1):], keep_overlap)

    idxes = np.where(keep_bboxes)
    return classes[idxes], scores[idxes], bboxes[idxes]
YOLOv2 implemented in TensorFlow (inference only)
import os
import numpy as np
import tensorflow as tf
######## basic layers #######
def leaky_relu(x):
    """Leaky ReLU activation with a negative slope (alpha) of 0.1."""
    activated = tf.nn.leaky_relu(x, alpha=0.1, name="leaky_relu")
    return activated
# Conv2d
def conv2d(x, filters, size, pad=0, stride=1, batch_normalize=1,
           activation=leaky_relu, use_bias=False, name="conv2d"):
    """Convolution block: explicit zero padding, conv, optional BN and activation.

    Args:
        x: input tensor; padding is applied to its second and third axes.
        filters: number of output channels.
        size: convolution kernel size.
        pad: zeros added on each side of both padded axes (0 disables padding).
        stride: convolution stride.
        batch_normalize: batch normalization (inference mode) is applied only
            when this equals 1.
        activation: callable applied last; any falsy value skips it.
        use_bias: whether the convolution has a bias term.
        name: layer name; the batch-norm layer is named ``name + "_bn"``.

    Returns:
        The output tensor of the block.
    """
    inputs = x
    if pad > 0:
        # Pad explicitly and convolve with "VALID" so the padding amount is
        # fully controlled by `pad`.
        border = [pad, pad]
        inputs = tf.pad(inputs, [[0, 0], border, border, [0, 0]])

    out = tf.layers.conv2d(inputs, filters, size, strides=stride,
                           padding="VALID", activation=None,
                           use_bias=use_bias, name=name)

    if batch_normalize == 1:
        out = tf.layers.batch_normalization(out, axis=-1, momentum=0.9,
                                            training=False, name=name + "_bn")

    if not activation:
        return out
    return activation(out)
# maxpool2d
def maxpool(x, size=2, stride=2, name="maxpool"):
    """Max-pooling layer.

    Args:
        x: input tensor.
        size: pooling window size.
        stride: pooling stride.
        name: layer name.  It is now forwarded to the pooling layer; it was
            previously accepted but silently ignored.

    Returns:
        The pooled tensor.
    """
    return tf.layers.max_pooling2d(x, size, stride, name=name)
# reorg layer
# Regroups the data: the pixels of each block are rearranged
# (x0[0] + x1[0] + x2[0] + x3[0] ...) to form a new feature map.
# https://zhuanlan.zhihu.com/p/35325884
def reorg(x, ksize, stride, rate):
    """Extract ksize x ksize patches and pack each one into the channel axis.

    Args:
        x: input feature map tensor.
        ksize: patch size along both spatial axes.
        stride: step between neighbouring patches.
        rate: sampling rate inside a patch (1 means adjacent pixels).

    Returns:
        The tensor produced by tf.extract_image_patches with "VALID" padding.
    """
    patch_shape = [1, ksize, ksize, 1]
    patch_step = [1, stride, stride, 1]
    patch_rate = [1, rate, rate, 1]
    return tf.extract_image_patches(x, patch_shape, patch_step, patch_rate,
                                    padding="VALID")
def darknet(images, n_last_channels=425):
    """Build the Darknet-19 based YOLOv2 detection network.

    Args:
        images: input image batch tensor.
        n_last_channels: number of channels produced by the final 1x1
            detection convolution.

    Returns:
        The raw detection feature map tensor.

    Shape comments below assume a 1 x 416 x 416 x 3 input.
    """
    # Stage 1-2: a single 3x3 conv followed by pooling.
    net = conv2d(images, 32, size=3, pad=1, name="conv1")
    net = maxpool(net, name="pool1")
    net = conv2d(net, 64, size=3, pad=1, name="conv2")
    net = maxpool(net, name="pool2")

    # Stage 3: 3x3 / 1x1 bottleneck / 3x3.
    net = conv2d(net, 128, size=3, pad=1, name="conv3_1")
    net = conv2d(net, 64, size=1, name="conv3_2")
    net = conv2d(net, 128, size=3, pad=1, name="conv3_3")
    net = maxpool(net, name="pool3")

    # Stage 4.
    net = conv2d(net, 256, size=3, pad=1, name="conv4_1")
    net = conv2d(net, 128, size=1, name="conv4_2")
    net = conv2d(net, 256, size=3, pad=1, name="conv4_3")
    net = maxpool(net, name="pool4")

    # Stage 5.
    net = conv2d(net, 512, size=3, pad=1, name="conv5_1")
    net = conv2d(net, 256, size=1, name="conv5_2")
    net = conv2d(net, 512, size=3, pad=1, name="conv5_3")
    net = conv2d(net, 256, size=1, name="conv5_4")
    net = conv2d(net, 512, size=3, pad=1, name="conv5_5")
    # 1,26,26,512 -- kept aside as the fine-grained passthrough branch.
    shortcut = net
    net = maxpool(net, name="pool5")

    # Stage 6.
    net = conv2d(net, 1024, size=3, pad=1, name="conv6_1")
    net = conv2d(net, 512, size=1, name="conv6_2")
    net = conv2d(net, 1024, size=3, pad=1, name="conv6_3")
    net = conv2d(net, 512, size=1, name="conv6_4")
    net = conv2d(net, 1024, size=3, pad=1, name="conv6_5")

    # Detection head.
    net = conv2d(net, 1024, size=3, pad=1, name="conv7_1")
    # 1,13,13,1024
    net = conv2d(net, 1024, size=3, pad=1, name="conv7_2")

    # Passthrough branch: 1,26,26,512 -> 1,26,26,64
    shortcut = conv2d(shortcut, 64, size=1, name="conv_shortcut")
    # 1,26,26,64 -> 1,13,13,256
    shortcut = reorg(shortcut, 2, 2, 1)
    # Small objects need the finer feature map, so both branches are
    # concatenated along channels: 1,13,13,256 + 1,13,13,1024 = 1,13,13,1280
    net = tf.concat([shortcut, net], axis=-1)
    net = conv2d(net, 1024, size=3, pad=1, name="conv8")

    # Detection layer: linear 1x1 conv with bias and no batch norm.
    net = conv2d(net, n_last_channels, size=1, batch_normalize=0,
                 activation=None, use_bias=True, name="conv_dec")
    return net
if __name__ == "__main__":
    # Smoke test: build the graph on random input and restore the checkpoint.
    x = tf.random_normal([1, 416, 416, 3])
    model = darknet(x)

    saver = tf.train.Saver()
    ckpt_file = "./checkpoint_dir/yolo2_coco.ckpt"
    with tf.Session() as sess:
        saver.restore(sess, ckpt_file)
Detection ops for Yolov2
import tensorflow as tf
import numpy as np
def decode(detection_feat, feat_sizes=(13, 13), num_classes=80,
           anchors=None):
    """Decode the raw detection feature map into boxes and probabilities.

    Args:
        detection_feat: network output, reshaped here to
            (-1, H * W, num_anchors, num_classes + 5).
        feat_sizes: (H, W) size of the detection feature map.
        num_classes: number of object classes.
        anchors: sequence of (width, height) anchor pairs; required.

    Returns:
        Tuple (bboxes, obj_probs, class_probs):
        bboxes is (-1, H * W, num_anchors, 4) holding
        [xmin, ymin, xmax, ymax] as fractions of the feature-map width/height;
        obj_probs is (-1, H * W, num_anchors);
        class_probs is (-1, H * W, num_anchors, num_classes).

    Raises:
        TypeError: if ``anchors`` is not provided.
    """
    if anchors is None:
        raise TypeError("decode() requires `anchors`: a sequence of (w, h) pairs")

    H, W = feat_sizes
    num_anchors = len(anchors)
    # 1,13,13,425 -> 1,169,5,85
    detection_results = tf.reshape(detection_feat, [-1, H * W, num_anchors,
                                                    num_classes + 5])

    # All four box values are computed directly from the network output.
    bbox_xy = tf.nn.sigmoid(detection_results[:, :, :, 0:2])
    bbox_wh = tf.exp(detection_results[:, :, :, 2:4])
    obj_probs = tf.nn.sigmoid(detection_results[:, :, :, 4])
    class_probs = tf.nn.softmax(detection_results[:, :, :, 5:])

    anchors = tf.constant(anchors, dtype=tf.float32)

    height_ind = tf.range(H, dtype=tf.float32)
    width_ind = tf.range(W, dtype=tf.float32)
    # meshgrid(x, y) returns (len(y), len(x)) grids, so width goes first to
    # get H x W grids whose row-major flattening matches the reshape above
    # (cell k -> column k % W, row k // W).  With the arguments swapped the
    # offsets were only correct for square feature maps.
    x_offset, y_offset = tf.meshgrid(width_ind, height_ind)
    x_offset = tf.reshape(x_offset, [1, -1, 1])  # 1,169,1
    y_offset = tf.reshape(y_offset, [1, -1, 1])  # 1,169,1

    # x, y are offsets inside the owning grid cell (same as YOLOv1).
    bbox_x = (bbox_xy[:, :, :, 0] + x_offset) / W
    bbox_y = (bbox_xy[:, :, :, 1] + y_offset) / H
    # w, h scale the matching anchor (unlike YOLOv1); 0.5 gives half extents.
    half_w = bbox_wh[:, :, :, 0] * anchors[:, 0] / W * 0.5
    half_h = bbox_wh[:, :, :, 1] * anchors[:, 1] / H * 0.5

    # Top-left / bottom-right corners: 1,169,5,4
    bboxes = tf.stack([bbox_x - half_w, bbox_y - half_h,
                       bbox_x + half_w, bbox_y + half_h], axis=3)
    return bboxes, obj_probs, class_probs