前言
源码版本是B站UP:霹雳吧啦Wz的yolov3版本
https://github.com/WZMIAOMIAO/deep-learning-for-image-processing
主要讲解NMS,scale_coords,draw_box三个部分的源码解析
NMS源码我单独发了一篇博客:YoloV3-SPP NMS源码详解
预测模块
源码
import os
import json
import time
import torch
import cv2
import numpy as np
from matplotlib import pyplot as plt
from build_utils import img_utils, torch_utils, utils
from models import Darknet
from draw_box_utils import draw_box
def main():
    """Run single-image inference and save the image with the detections drawn.

    All paths (cfg, weights, class JSON, input image) are hard-coded below.
    The pipeline is: load model -> letterbox the image -> forward pass ->
    non-max suppression -> map boxes back to the original image size ->
    draw the boxes -> show and save the result as ``test_result.jpg``.
    """
    img_size = 512  # must be a multiple of 32, e.g. 416, 512, 608
    cfg = "cfg/my_yolov3.cfg"  # the generated .cfg file
    weights = "weights/629cls2best.pt"  # the trained weights file
    json_path = "./WiderPerson/my_yolo_dataset/pedestrian_classes.json"  # class-label JSON
    img_path = "test.jpg"
    assert os.path.exists(cfg), "cfg file {} dose not exist.".format(cfg)
    assert os.path.exists(weights), "weights file {} dose not exist.".format(weights)
    assert os.path.exists(json_path), "json file {} dose not exist.".format(json_path)
    assert os.path.exists(img_path), "image file {} dose not exist.".format(img_path)

    # Use a context manager so the file handle is closed even if parsing fails.
    with open(json_path, 'r') as json_file:
        class_dict = json.load(json_file)
    # Invert the mapping: the JSON's values become the lookup keys.
    category_index = {v: k for k, v in class_dict.items()}

    input_size = (img_size, img_size)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = Darknet(cfg, img_size)
    model.load_state_dict(torch.load(weights, map_location=device)["model"])
    model.to(device)

    # Switch the network to evaluation mode.
    model.eval()
    # Gradient tracking is disabled for everything inside this block.
    with torch.no_grad():
        # Warm-up: push an all-zero image through the model once.
        img = torch.zeros((1, 3, img_size, img_size), device=device)
        model(img)

        img_o = cv2.imread(img_path)  # BGR
        assert img_o is not None, "Image Not Found " + img_path

        # Resize + pad the input; only the first return value (the image) is used.
        img = img_utils.letterbox(img_o, new_shape=input_size, auto=True, color=(0, 0, 0))[0]

        # Reverse the channel axis (BGR -> RGB) and move channels first (HWC -> CHW).
        img = img[:, :, ::-1].transpose(2, 0, 1)
        # The reversed/transposed view is not contiguous; make it so before from_numpy.
        img = np.ascontiguousarray(img)

        img = torch.from_numpy(img).to(device).float()
        img /= 255.0  # scale (0, 255) to (0, 1)
        img = img.unsqueeze(0)  # add batch dimension

        # Forward pass, timed.
        # NOTE(review): time_synchronized presumably syncs CUDA before reading the
        # clock -- the helper is not visible here, confirm.
        t1 = torch_utils.time_synchronized()
        pred = model(img)[0]  # only get inference result
        t2 = torch_utils.time_synchronized()
        print(t2 - t1)

        # Non-max suppression; [0] selects the result for the single image in the batch.
        pred = utils.non_max_suppression(pred, conf_thres=0.1, iou_thres=0.6, multi_label=True)[0]
        t3 = time.time()
        print(t3 - t2)

        if pred is None:
            print("No target detected.")
            exit(0)

        # Map the predicted boxes from the network input size back to the original image.
        pred[:, :4] = utils.scale_coords(img.shape[2:], pred[:, :4], img_o.shape).round()
        print(pred.shape)

        # Columns 0-3: box coordinates, column 4: score, column 5: class index.
        bboxes = pred[:, :4].detach().cpu().numpy()
        scores = pred[:, 4].detach().cpu().numpy()
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
        # The +1 shifts the class indices (category_index presumably starts at 1 -- confirm).
        classes = pred[:, 5].detach().cpu().numpy().astype(int) + 1

        # draw_box receives the image as RGB and returns a PIL image.
        img_o = draw_box(img_o[:, :, ::-1], bboxes, classes, scores, category_index)
        plt.imshow(img_o)
        plt.show()

        img_o.save("test_result.jpg")


if __name__ == "__main__":
    main()
letter box缩放图片
源码
def letterbox(img: np.ndarray,
              new_shape=(416, 416),
              color=(114, 114, 114),
              auto=True,
              scale_fill=False,
              scale_up=True):
    """Resize an image towards ``new_shape`` and pad the remainder.

    :param img: input image as a numpy array, indexed (height, width, ...)
    :param new_shape: target (height, width); a single int means a square target
    :param color: value used to fill the padded border
    :param auto: keep the aspect ratio and pad only by the remainder modulo 64
    :param scale_fill: (only consulted when ``auto`` is False) stretch the image
        to exactly ``new_shape`` without padding
    :param scale_up: when False, images smaller than the target are not enlarged
    :return: ``(img, ratio, (dw, dh))`` -- the padded image, the (width, height)
        scale ratios, and the padding applied per side (may be fractional)
    """
    shape = img.shape[:2]  # current (height, width)
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old): the smaller of the two keeps the image inside the target.
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scale_up:  # only scale down, never up
        r = min(r, 1.0)

    # Compute the resized (unpadded) size and the total padding still missing.
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (width, height)
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:
        # Minimum rectangle: keep only the remainder so the padded sides stay
        # on the same 64-pixel grid as new_shape.
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)
    elif scale_fill:
        # Stretch to the exact target. cv2.resize takes (width, height), so the
        # (height, width) target must be swapped; the ratios pair width with
        # width and height with height. Identical to before for square targets.
        dw, dh = 0, 0
        new_unpad = new_shape[1], new_shape[0]
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # wh ratios

    # Split the padding between the two opposite sides.
    dw /= 2
    dh /= 2

    # shape is (h, w) while new_unpad is (w, h); resize only when they differ.
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # The +/-0.1 decides who gets the odd pixel when the half-padding ends in .5:
    # bottom/right receive it, top/left do not.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)
解析
def letterbox(img: np.ndarray,
new_shape=(416, 416),
color=(114, 114, 114),
auto=True,
scale_fill=False,
scale_up=True):
"""
将图片缩放调整到指定大小
:param img: 输入的图像numpy格式
:param new_shape: 输入网络的shape
:param color: padding用什么颜色填充
:param auto:原图比例不变
:param scale_fill: 简单粗暴缩放到指定大小
:param scale_up: false时,对于img最长边小于指定边长时,不改变img的宽高
:return:
"""
shape = img.shape[:2] # [h, w]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
对于传进来的new_shape,判断是不是一个int,如果为int,则修改为元组,主要防止传参类型不一致。
# scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
if not scale_up: # only scale down, do not scale up (for better test mAP) 对于大于指定输入大小的图片进行缩放,小于的不变
r = min(r, 1.0)
为了形象说明上述流程,我传入一张img(h,w,3)=img(762,1019,3)的图片
这里传入的new_shape假定为(512,512),可知:
$$r=\min\left(\frac{512}{img.h},\frac{512}{img.w}\right)$$
r表示target图片和传入图片的shape的高宽比例的最小值
$$r=\min\left(\frac{512}{img.h},\frac{512}{img.w}\right)=512\cdot\min\left(\frac{1}{img.h},\frac{1}{img.w}\right)=\frac{512}{\max(img.h,img.w)}$$
由以上分析得,r是target和传入图片的最大边的比例
# compute padding
ratio = r, r # width, height ratios
# round先四舍五入到最接近的整数,int再保证结果是整型(cv2.resize需要整数尺寸)
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
new_unpad写成公式:
$$new\_unpad=\left[img.w\cdot\frac{512}{\max(img.h,img.w)},\ img.h\cdot\frac{512}{\max(img.h,img.w)}\right]$$
写成如下形式好理解点:
$$new\_unpad=\left[512\cdot\frac{img.w}{\max(img.h,img.w)},\ 512\cdot\frac{img.h}{\max(img.h,img.w)}\right]$$
注:此时new_unpad表示的是shape(w,h)为 $(1019,762)$ 的img要scale成的初始target的shape,即 $new\_unpad$,并且保持了原图比例不变。
dw,dh写成公式:
$$dw=512-512\cdot\frac{img.w}{\max(img.h,img.w)}$$
$$dh=512-512\cdot\frac{img.h}{\max(img.h,img.w)}$$
此时dw,dh表示缩放后的图片(new_unpad)与target(new_shape)在宽、高方向上相差的像素数,也就是后面需要padding补齐的总像素数
此时target的shape(即new_shape)为:(512, 512)
传入图片的shape(h,w)为:(762, 1019)
此时new_unpad(w,h)为:(512, 383)
dw:0
dh:129
if auto: # minimun rectangle 保证原图比例不变,将图像最大边缩放到指定大小
# 这里的取余操作可以保证padding后的图片是32的整数倍(416x416),如果是(512x512)可以保证是64的整数倍
dw, dh = np.mod(dw, 64), np.mod(dh, 64) # wh padding
elif scale_fill: # stretch 简单粗暴的将图片缩放到指定尺寸
dw, dh = 0, 0
new_unpad = new_shape
ratio = new_shape[0] / shape[1], new_shape[1] / shape[0] # wh ratios
这里采用auto=True,保持原图比例不变:
dw,dh取模64的余数,得dw=0,dh=1
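注:取余之后,padding完的边长等于 new_shape − 64k(k为整数)。这里模型采用的512X512指定预测,512 mod 64 = 0,padding后的边长是64的整数倍;如果采用416X416指定预测(416 mod 64 = 32),padding后的边长仍然是32的整数倍,但不再保证是64的整数倍,而且相比mod 32可能会多padding 32个像素。个人觉得这是为512X512指定预测设计的参数,方便计算速度?大规模推理?这个参数值得留意,会随着模型(下采样倍率)修改需要改动。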
dw /= 2 # divide padding into 2 sides 将padding分到上下,左右两侧
dh /= 2
得dw=0,dh=0.5
# shape:[h, w] new_unpad:[w, h]
if shape[::-1] != new_unpad:
img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) # 计算上下两侧的padding
left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) # 计算左右两侧的padding
如果img的宽高和new_unpad宽高不一致,那么将resize这个img从(1019,762)resize成(512,383)
resize后其中一边未必是32的倍数:这里高为383,dh取余后是1,只有补上这1个像素(383+1=384=12×32)才能成为32的倍数(宽512已经是32的倍数了)。
这个0.1是为了保证padding值是正确的,分两种情况
- 当dh或者dw为整数时,0.1没有起到作用
- 当dh或者dw为带有小数时,0.1的作用会优先让bottom相比top多padding一个像素单位,left和right类同
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
return img, ratio, (dw, dh)
调用cv2的库对img补充边界,补充像素值为传入的color(预测脚本中传入的是(0,0,0),函数默认值是(114,114,114)),边界像素反向传播时不影响img其他像素的计算
返回img,ratio是target和原img最大边的比例,还有(dw,dh)
使用时目前只使用到第一个返回值img
scale_coords映射尺度
对模型经过NMS后的输出进行尺度映射,流程图简述如下:
源码
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    """Map xyxy box coordinates from the resized image back to the original image.

    ``coords`` is modified in place and also returned.

    :param img1_shape: shape of the resized (network input) image, (height, width)
    :param coords: predicted boxes; columns 0-3 are x1, y1, x2, y2
    :param img0_shape: shape of the original image, height and width first
    :param ratio_pad: optional ``((gain, ...), (pad_x, pad_y))`` recorded while
        resizing; when omitted both values are derived from the two shapes
    :return: the same ``coords`` object, rescaled and clipped to the original image
    """
    if ratio_pad is not None:
        # Reuse the values recorded while resizing.
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:
        # Derive them from the shapes: gain = resized / original.
        gain = max(img1_shape) / max(img0_shape)
        pad_x = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_y = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_x, pad_y

    # Remove the padding offset first, then undo the scaling.
    for x_col in (0, 2):
        coords[:, x_col] -= pad[0]
    for y_col in (1, 3):
        coords[:, y_col] -= pad[1]
    coords[:, :4] /= gain

    # Keep every box inside the original image.
    clip_coords(coords, img0_shape)
    return coords
def clip_coords(boxes, img_shape):
    """Clamp xyxy boxes in place so they lie inside an image.

    :param boxes: box tensor whose columns 0-3 are x1, y1, x2, y2
    :param img_shape: image shape with height first and width second
    """
    height, width = img_shape[0], img_shape[1]
    # x columns are bounded by the width, y columns by the height.
    upper_bounds = (width, height, width, height)
    for column, upper in enumerate(upper_bounds):
        boxes[:, column].clamp_(0, upper)
解析
# Rescale coords (xyxy) from img1_shape to img0_shape
if ratio_pad is None: # calculate from img0_shape
gain = max(img1_shape) / max(img0_shape) # gain = old / new
pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding
gain为放缩后最长边和原图最长边的比例,其实这个比例也是它们短边的比例,因为短边也会根据gain这个最长边比例去缩放。这个gain和letterbox里的r是同一个值。
pad与letterbox中的dw,dh值几乎一致
coords[:, [0, 2]] -= pad[0] # x padding
对预测的横坐标去除padding(减去左侧填充的像素)
coords[:, [1, 3]] -= pad[1] # y padding
对预测的纵坐标去除padding(减去上方填充的像素)
coords[:, :4] /= gain
对所有坐标恢复到img0尺度
clip_coords(coords, img0_shape)
将恢复的预测坐标coords传入clip_coords函数,如下:
def clip_coords(boxes, img_shape):
# Clip bounding xyxy bounding boxes to image shape (height, width)
boxes[:, 0].clamp_(0, img_shape[1]) # x1
boxes[:, 1].clamp_(0, img_shape[0]) # y1
boxes[:, 2].clamp_(0, img_shape[1]) # x2
boxes[:, 3].clamp_(0, img_shape[0]) # y2
对所有预测框坐标范围进行约束。
return coords
最后返回coords
scale_coords总结
源码没有将ratio和pad传入scale_coords函数中,当然可以实现这部分,letterbox的返回值包含了所需的参数,这里通过获得的缩放图的确可以对原图进行反求缩放比gain和pad,细心的朋友会发现,这里反求用的是缩放后的图为基准去求比值,并且这个缩放后的图是经过letterbox缩放和padding操作得到的,而在letterbox那的r比值是没有经过padding操作求得,所以数值上会有一丢丢差距,具体可以看我debug:
在letterbox那pad是(0,0.5),而在scale_coords中用缩放后的图(384,512)反求得到的pad约为(0,0.565)
draw_box
源码
def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
    """Collect the label text and colour for each box scoring above ``thresh``.

    Results are written into the two mapping arguments, keyed by the box as a
    tuple. Scanning stops at the first score that is not above ``thresh``, so
    the inputs are expected to be sorted by descending score.

    :param boxes: array of boxes, one row per detection
    :param scores: score of each detection
    :param classes: class id of each detection
    :param category_index: mapping from class id to class name
    :param thresh: minimum score (exclusive) for a box to be kept
    :param box_to_display_str_map: output mapping, box -> list of label strings
    :param box_to_color_map: output mapping, box -> colour
    """
    for idx in range(boxes.shape[0]):
        score = scores[idx]
        if not score > thresh:
            # Scores are already sorted, so nothing after this one can pass.
            break

        class_id = classes[idx]
        box_key = tuple(boxes[idx].tolist())  # numpy -> list -> tuple (hashable)

        # Fall back to a placeholder name for unknown class ids.
        class_name = category_index[class_id] if class_id in category_index else 'N/A'
        label = '{}: {}%'.format(str(class_name), int(100 * score))

        box_to_display_str_map[box_key].append(label)
        # Cycle through the palette by class id.
        box_to_color_map[box_key] = STANDARD_COLORS[class_id % len(STANDARD_COLORS)]
def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
    """Draw the boxes scoring above ``thresh``, with their labels, onto an image.

    :param image: numpy array or PIL image; an array is converted to a PIL image
    :param boxes: array of boxes, each unpacked as (xmin, ymin, xmax, ymax)
    :param classes: class id of each box
    :param scores: score of each box
    :param category_index: mapping from class id to class name
    :param thresh: minimum score for a box to be drawn
    :param line_thickness: width of the box outline
    :return: the PIL image with boxes and labels drawn on it
    """
    labels_by_box = collections.defaultdict(list)
    color_by_box = collections.defaultdict(str)

    # Fill both maps with the detections that pass the score threshold.
    filter_low_thresh(boxes, scores, classes, category_index, thresh, labels_by_box, color_by_box)

    # Drawing is done on a PIL image.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    canvas = ImageDraw.Draw(image)

    for box, color in color_by_box.items():
        left, top, right, bottom = box
        # Closed outline: start at the top-left corner and return to it.
        outline = [(left, top), (left, bottom), (right, bottom),
                   (right, top), (left, top)]
        canvas.line(outline, width=line_thickness, fill=color)
        draw_text(canvas, labels_by_box, box, left, right, top, bottom, color)
    return image
def _text_size(font, text):
    """Return the (width, height) of ``text`` when rendered with ``font``."""
    # font.getsize() was removed in Pillow 10. The right/bottom edges of
    # getbbox() are the same two numbers getsize() used to return.
    if hasattr(font, "getbbox"):
        _, _, right, bottom = font.getbbox(text)
        return right, bottom
    return font.getsize(text)  # older Pillow without getbbox on this font type


def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
    """Draw the label strings of one box next to it.

    :param draw: drawing object the labels are rendered with
    :param box_to_display_str_map: mapping from box to its list of label strings
    :param box: key of the box whose labels are drawn
    :param left: left edge of the box
    :param right: right edge of the box (not used here)
    :param top: top edge of the box
    :param bottom: bottom edge of the box
    :param color: fill colour of the label background
    """
    try:
        font = ImageFont.truetype('arial.ttf', 20)
    except IOError:
        font = ImageFont.load_default()

    labels = box_to_display_str_map[box]

    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    display_str_heights = [_text_size(font, ds)[1] for ds in labels]
    # Each display_str has a top and bottom margin of 0.05x.
    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

    if top > total_display_str_height:
        text_bottom = top
    else:
        text_bottom = bottom + total_display_str_height

    # Reverse list and print from bottom to top.
    for display_str in labels[::-1]:
        text_width, text_height = _text_size(font, display_str)
        margin = np.ceil(0.05 * text_height)
        # Filled background rectangle, then the text on top of it.
        draw.rectangle([(left, text_bottom - text_height - 2 * margin),
                        (left + text_width, text_bottom)], fill=color)
        draw.text((left + margin, text_bottom - text_height - margin),
                  display_str,
                  fill='black',
                  font=font)
        # NOTE(review): this moves up by text_height - 2 * margin, less than the
        # rectangle height, so stacked labels overlap slightly -- kept as in the
        # original; confirm whether "+ 2 * margin" was intended.
        text_bottom -= text_height - 2 * margin
解析
def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
box_to_display_str_map = collections.defaultdict(list)
box_to_color_map = collections.defaultdict(str)
image:原图Img_o
boxes:预测框的前4个坐标x1y1x2y2
classes:预测框的类(逻辑符号从1开始计算)
scores:cls_conf类置信度
category_index:分类字典
thresh:cls_conf阈值
line_thickness:box的边缘线宽像素为3
filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)
对低类置信度cls_conf的预测框进行筛除,这一步只对NMS中multi_label为False或者单分类的预测模式有作用,因为经过NMS时在multi_label为True并且多分类时会对cls_conf进行筛选。
# Draw all boxes onto image.
if isinstance(image, np.ndarray):
image = Image.fromarray(image)
将ndarray对象转化为image对象
draw = ImageDraw.Draw(image)
初始化ImageDraw对象
im_width, im_height = image.size
for box, color in box_to_color_map.items():
xmin, ymin, xmax, ymax = box
(left, right, top, bottom) = (xmin * 1, xmax * 1,
ymin * 1, ymax * 1)
draw.line([(left, top), (left, bottom), (right, bottom),
(right, top), (left, top)], width=line_thickness, fill=color)
读取box_to_color_map字典在draw对象画出对应坐标的线框
draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
draw_text的代码详见:draw_text