Yolo-V3-SPP 预测模块

最新推荐文章于 2024-07-19 14:15:46 发布

小哈蒙德

最新推荐文章于 2024-07-19 14:15:46 发布

阅读量933

点赞数 2

分类专栏：目标检测文章标签：深度学习 python 目标检测 Yolo

本文链接：https://blog.csdn.net/qq_38109282/article/details/119759133

版权

目标检测专栏收录该内容

15 篇文章

订阅专栏

前言

源码版本是B站UP主：霹雳吧啦Wz的yolov3版本
https://github.com/WZMIAOMIAO/deep-learning-for-image-processing
主要讲解NMS,scale_coords,draw_box三个部分的源码解析

请添加图片描述

NMS源码我单独发了一篇博客:YoloV3-SPP NMS源码详解

预测模块

源码

import os
import json
import time
import torch
import cv2
import numpy as np
from matplotlib import pyplot as plt
from build_utils import img_utils, torch_utils, utils
from models import Darknet
from draw_box_utils import draw_box
def main():
    """Run YOLOv3-SPP inference on a single image, then display and save the result.

    The function builds a ``Darknet`` model from ``cfg``, loads the trained
    weights, letterboxes ``test.jpg`` to the network input size, runs a forward
    pass followed by non-max suppression, maps the surviving boxes back to the
    original image scale, draws them, shows the figure with matplotlib and
    writes ``test_result.jpg``.

    All paths and thresholds are hard-coded below; edit them for your own setup.
    Returns ``None``; returns early when NMS yields no detections.
    """
    img_size = 512  # must be a multiple of 32, e.g. 416, 512, 608
    cfg = "cfg/my_yolov3.cfg"  # change to the generated .cfg file
    weights = "weights/629cls2best.pt"  # change to your own trained weights
    json_path = "./WiderPerson/my_yolo_dataset/pedestrian_classes.json"  # class-label json file
    img_path = "test.jpg"
    assert os.path.exists(cfg), "cfg file {} dose not exist.".format(cfg)
    assert os.path.exists(weights), "weights file {} dose not exist.".format(weights)
    assert os.path.exists(json_path), "json file {} dose not exist.".format(json_path)
    assert os.path.exists(img_path), "image file {} dose not exist.".format(img_path)

    # Use a context manager so the label file is closed even if parsing fails.
    with open(json_path, 'r') as json_file:
        class_dict = json.load(json_file)
    # Invert the json mapping so it can be indexed by its values
    # (presumably name -> id in the file, giving id -> name here; verify against the json).
    category_index = {v: k for k, v in class_dict.items()}

    input_size = (img_size, img_size)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = Darknet(cfg, img_size)
    model.load_state_dict(torch.load(weights, map_location=device)["model"])
    model.to(device)
    # Switch the network to evaluation mode.
    model.eval()
    # Disable gradient tracking for everything below.
    with torch.no_grad():
        # Warm-up: push an all-zero image through the model once.
        img = torch.zeros((1, 3, img_size, img_size), device=device)
        model(img)

        img_o = cv2.imread(img_path)  # BGR
        assert img_o is not None, "Image Not Found " + img_path

        # Resize with letterbox; auto=True pads only up to the modulus remainder,
        # so the result may be smaller than input_size on one side.
        img = img_utils.letterbox(img_o, new_shape=input_size, auto=True, color=(0, 0, 0))[0]
        # img[:, :, ::-1] flips BGR -> RGB; transpose moves HxWxC to CxHxW.
        img = img[:, :, ::-1].transpose(2, 0, 1)
        # The reversed view is not contiguous in memory; make it so before torch.from_numpy.
        img = np.ascontiguousarray(img)

        # Convert to a float tensor on the target device.
        img = torch.from_numpy(img).to(device).float()
        img /= 255.0  # scale (0, 255) to (0, 1)
        img = img.unsqueeze(0)  # add batch dimension

        # Forward pass; t2 - t1 is the inference time.
        t1 = torch_utils.time_synchronized()
        pred = model(img)[0]  # only get inference result
        t2 = torch_utils.time_synchronized()
        print(t2 - t1)

        # Non-max suppression; [0] selects the result for the single image in the batch.
        pred = utils.non_max_suppression(pred, conf_thres=0.1, iou_thres=0.6, multi_label=True)[0]
        t3 = time.time()
        print(t3 - t2)

        if pred is None:
            print("No target detected.")
            # Plain return instead of exit(0): `exit` is a site-module helper
            # and is not guaranteed to exist in every interpreter setup.
            return

        # Map the predicted boxes from the network-input scale back to the original image.
        pred[:, :4] = utils.scale_coords(img.shape[2:], pred[:, :4], img_o.shape).round()
        print(pred.shape)

        # Columns 0-3: box coordinates; column 4: confidence; column 5: class index.
        bboxes = pred[:, :4].detach().cpu().numpy()
        scores = pred[:, 4].detach().cpu().numpy()
        # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
        # +1 shifts the class indices to match the 1-based labels.
        classes = pred[:, 5].detach().cpu().numpy().astype(int) + 1

        # draw_box receives the RGB view of the original image.
        img_o = draw_box(img_o[:, :, ::-1], bboxes, classes, scores, category_index)
        plt.imshow(img_o)
        plt.show()
        img_o.save("test_result.jpg")
# Run the prediction demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

letter box缩放图片

源码

def letterbox(img: np.ndarray,
              new_shape=(416, 416),
              color=(114, 114, 114),
              auto=True,
              scale_fill=False,
              scale_up=True,
              pad_mod=64):
    """
    Resize an image to fit ``new_shape`` and pad the remainder with ``color``.

    :param img: input image as a numpy array; its first two dims are (height, width)
    :param new_shape: target (height, width), or a single int for a square target
    :param color: border value used for the padding
    :param auto: keep the aspect ratio and pad only ``padding % pad_mod`` pixels per
        axis (minimum rectangle), so the output may be smaller than ``new_shape``
    :param scale_fill: only consulted when ``auto`` is False; stretch the image to
        exactly ``new_shape`` without padding (aspect ratio is not kept)
    :param scale_up: when False the image is only ever shrunk, never enlarged
    :param pad_mod: modulus applied to the padding when ``auto`` is True; the
        default 64 reproduces the previous hard-coded value
    :return: ``(img, ratio, (dw, dh))`` - the padded image, the (width, height)
        scale ratios, and the padding applied to EACH side along width / height
    """

    shape = img.shape[:2]  # [h, w]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # scale ratio (new / old): the smaller per-axis ratio keeps the whole image inside new_shape
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scale_up:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (w, h) after resize
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle: keep only the remainder of the padding
        # The padded side length stays congruent to new_shape modulo pad_mod.
        dw, dh = np.mod(dw, pad_mod), np.mod(dh, pad_mod)  # wh padding
    elif scale_fill:  # stretch to the exact target size
        dw, dh = 0, 0
        # new_shape is (h, w) everywhere else in this function, while new_unpad
        # and ratio are (w, h); swap the indices so non-square targets are correct.
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # wh ratios

    dw /= 2  # divide padding into 2 sides (left/right and top/bottom)
    dh /= 2

    # shape: [h, w]  new_unpad: [w, h]
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # The -0.1 / +0.1 offsets send the odd pixel of a x.5 padding to bottom / right.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))

    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)

解析

def letterbox(img: np.ndarray,
              new_shape=(416, 416),
              color=(114, 114, 114),
              auto=True,
              scale_fill=False,
              scale_up=True):
    """
    将图片缩放调整到指定大小
    :param img: 输入的图像numpy格式
    :param new_shape: 输入网络的shape
    :param color: padding用什么颜色填充
    :param auto:原图比例不变
    :param scale_fill: 简单粗暴缩放到指定大小
    :param scale_up:  false时，对于img最长边小于指定边长时，不改变img的宽高
    :return:
    """
    shape = img.shape[:2]  # [h, w]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

对于传进来的new_shape，判断是不是一个int，如果为int，则修改为元组，主要防止传参类型不一致。

    # scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scale_up:  # only scale down, do not scale up (for better test mAP) 对于大于指定输入大小的图片进行缩放,小于的不变
        r = min(r, 1.0)

为了形象说明上述流程，我传入一张img(h,w,3)=img(762,1019,3)的图片
这里传入的new_shape假定为(512,512),可知：
$r=min(\frac{512}{img.h},\frac{512}{img.w})$
r表示target图片和传入图片的shape的高宽比例的最小值
$r=min(\frac{512}{img.h},\frac{512}{img.w})=512\ast min(\frac{1}{img.h},\frac{1}{img.w})=\frac{512}{max(img.h,img.w)}$

由以上分析得，r是target和传入图片的最大边的比例

    # compute padding
    ratio = r, r  # width, height ratios
    # round先四舍五入到最近的整数（Python 3中单参数round对float已返回int），外层int只是保证结果为整型
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding

new_unpad写成公式：
$new\_unpad=[img.w\ast \frac{512}{max(img.h,img.w)},img.h\ast \frac{512}{max(img.h,img.w)}]$
写成如下形式好理解点：
$new\_unpad=[512\ast \frac{img.w}{max(img.h,img.w)},512\ast \frac{img.h}{max(img.h,img.w)}]$
注：此时new_unpad表示的是shape为 $(1019, 762)$ 的img要scale成的初始target的shape为 $new\_unpad$ ，并且保持了原图比例不变。
dw,dh写成公式：
$dw=512-512\ast \frac{img.w}{max(img.h,img.w)}$
$dh=512-512\ast \frac{img.h}{max(img.h,img.w)}$
此时dw,dh表示把缩放后的图片(new_unpad)补齐到new_shape时，宽、高方向各自还需要padding的像素总数

此时target的shape(即new_shape)为：
在这里插入图片描述
传入图片的shape(h,w)为：

此时new_unpad为：

dw:

dh：

    if auto:  # minimun rectangle 保证原图比例不变，将图像最大边缩放到指定大小
        # 这里的取余操作可以保证padding后的图片是32的整数倍(416x416)，如果是(512x512)可以保证是64的整数倍
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
    elif scale_fill:  # stretch 简单粗暴的将图片缩放到指定尺寸
        dw, dh = 0, 0
        new_unpad = new_shape
        ratio = new_shape[0] / shape[1], new_shape[1] / shape[0]  # wh ratios

这里采用auto=True,保持原图比例不变：
dw,dh取模64的余数，得dw=0,dh=1
注：这里模型采用的512X512指定预测，mod64是可以的，512mod64=0，padding后的边长是64的整数倍；如果采用416X416指定预测，由于padding后的边长与new_shape对64同余，而416mod64=32，所以padding后的边长仍是32的整数倍，同样满足网络32倍下采样的要求。不过mod64最多会padding 63个像素，比mod32多补了一些无用像素，会带来少量多余计算。这个参数值得留意，会随着模型修改需要改动。

    dw /= 2  # divide padding into 2 sides 将padding分到上下，左右两侧
    dh /= 2

得dw=0，dh=0.5

    # shape:[h, w]  new_unpad:[w, h]
    if shape[::-1] != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))  # 计算上下两侧的padding
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))  # 计算左右两侧的padding

如果img的宽高和new_unpad宽高不一致，那么将resize这个img从(1019,762)resize成(512,383)
其中一边未必是32的倍数，dh取余后是1，那么对于383，只有补充这个余数才能成为32的倍数（512已经是32倍数了）。

这个0.1是为了保证padding值是正确的，分两种情况

当dh或者dw为整数时，0.1没有起到作用
当dh或者dw为带有小数时，0.1的作用会优先让bottom相比top多padding一个像素单位，left和right类同

    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)

调用cv2的库对img补充边界，补充像素值为(0,0,0)，边界像素反向传播时不影响img其他像素的计算
返回img，ratio是target和原img最大边的比例，还有(dw,dh)
使用时目前只使用到第一个返回值img

scale_coords映射尺度

对模型经过NMS后的输出进行尺度映射，流程图简述如下：
在这里插入图片描述

源码

def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    """
    Map predicted boxes from the letterboxed image back to the original image.

    ``coords`` is modified IN PLACE and also returned.

    :param img1_shape: (height, width) of the letterboxed network input
    :param coords: boxes in xyxy order; the first four columns are rescaled
    :param img0_shape: shape of the original image; only [0]=height and
        [1]=width are read, so an (h, w, c) shape is fine
    :param ratio_pad: optional ``((gain, ...), (pad_x, pad_y))`` recorded during
        resizing; when None both values are recomputed from the two shapes
    :return: the same ``coords`` object, rescaled and clipped to the original image
    """
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        # Use the smaller per-axis ratio: that is the scale an aspect-preserving
        # resize must have used. max(img1)/max(img0) is only right when the long
        # side of the original maps onto the long side of the network input.
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = resized / original
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2]] -= pad[0]  # remove x padding
    coords[:, [1, 3]] -= pad[1]  # remove y padding
    coords[:, :4] /= gain  # undo the resize
    clip_coords(coords, img0_shape)
    return coords


def clip_coords(boxes, img_shape):
    """
    Clamp xyxy boxes in place to the image bounds.

    :param boxes: 2-D tensor whose first four columns are x1, y1, x2, y2
    :param img_shape: image shape with [0]=height and [1]=width
    :return: None, ``boxes`` is modified in place
    """
    boxes[:, 0].clamp_(0, img_shape[1])  # x1
    boxes[:, 1].clamp_(0, img_shape[0])  # y1
    boxes[:, 2].clamp_(0, img_shape[1])  # x2
    boxes[:, 3].clamp_(0, img_shape[0])  # y2

解析

    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = max(img1_shape) / max(img0_shape)  # gain  = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding

gain为放缩后最长边和原图最长边的比例，其实这个比例也是它们短边的比例，因为短边也会根据gain这个最长边比例去缩放。这个gain和letterbox里的r是同一个值。

pad与letterbox中的dw，dh值几乎一致

    coords[:, [0, 2]] -= pad[0]  # x padding

对预测的横坐标恢复padding

    coords[:, [1, 3]] -= pad[1]  # y padding

对预测的纵坐标恢复padding（减去上下两侧的padding偏移）

    coords[:, :4] /= gain

对所有坐标恢复到img0尺度

    clip_coords(coords, img0_shape)

将恢复的预测坐标coords传入clip_coords函数，如下：

def clip_coords(boxes, img_shape):
    """Clamp xyxy boxes in place so they lie inside an image of shape (height, width)."""
    height, width = img_shape[0], img_shape[1]
    # Columns 0 and 2 are x coordinates (limited by width); 1 and 3 are y (limited by height).
    for column, upper in ((0, width), (1, height), (2, width), (3, height)):
        boxes[:, column].clamp_(0, upper)

对所有预测框坐标范围进行约束。

    return coords

最后返回coords

scale_coords总结

源码没有将ratio和pad传入scale_coords函数中，当然可以实现这部分，letterbox的返回值包含了所需的参数，这里通过获得的缩放图的确可以对原图进行反求缩放比gain和pad，细心的朋友会发现，这里反求用的是缩放后的图为基准去求比值，并且这个缩放后的图是经过letterbox缩放和padding操作得到的，而在letterbox那的r比值是没有经过padding操作求得，所以数值上会有一丢丢差距，具体可以看我debug:
在这里插入图片描述
在letterbox那pad是(0,0.5)

draw_box

源码

def filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map):
    """Collect the label text and colour for every leading box whose score exceeds ``thresh``.

    Boxes are visited in order and the scan stops at the first score that is
    not above ``thresh`` (the original author notes that the network output is
    already sorted by score, so nothing after it can qualify).

    Results are written into the two mapping arguments, keyed by the box as a
    tuple of coordinates: ``box_to_display_str_map[box]`` gets a
    ``"<name>: <percent>%"`` string appended, ``box_to_color_map[box]`` gets an
    entry of ``STANDARD_COLORS`` chosen by class id. Returns ``None``.
    """
    total = boxes.shape[0]
    for idx in range(total):
        score = scores[idx]
        if not score > thresh:
            # First score at or below the threshold ends the scan.
            break

        key = tuple(boxes[idx].tolist())  # numpy -> list -> tuple, usable as a dict key
        cls_id = classes[idx]
        # Unknown class ids are labelled 'N/A'.
        class_name = category_index[cls_id] if cls_id in category_index.keys() else 'N/A'
        label = '{}: {}%'.format(str(class_name), int(100 * score))
        box_to_display_str_map[key].append(label)
        # Cycle through the palette when there are more classes than colours.
        palette_pos = cls_id % len(STANDARD_COLORS)
        box_to_color_map[key] = STANDARD_COLORS[palette_pos]
def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
    """Draw the boxes scoring above ``thresh`` and their labels onto ``image``.

    :param image: a numpy array (converted with ``Image.fromarray``) or an
        object accepted by ``ImageDraw.Draw``
    :param boxes: array of boxes, one row per box, unpacked as xmin, ymin, xmax, ymax
    :param classes: class id per box, looked up in ``category_index``
    :param scores: score per box, compared against ``thresh``
    :param category_index: mapping from class id to display name
    :param thresh: boxes whose score is not above this value are not drawn
    :param line_thickness: outline width passed to ``draw.line``
    :return: the image object that was drawn on
    """
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)

    # Fill both maps (keyed by box tuple) with the boxes that pass the threshold.
    filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)

    # Draw all boxes onto image.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    draw = ImageDraw.Draw(image)
    for box, color in box_to_color_map.items():
        left, top, right, bottom = box
        # Closed polyline: top-left -> bottom-left -> bottom-right -> top-right -> top-left.
        draw.line([(left, top), (left, bottom), (right, bottom),
                   (right, top), (left, top)], width=line_thickness, fill=color)
        draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)
    return image

def _text_size(font, text):
    """Return ``(width, height)`` of ``text`` for ``font`` across Pillow versions."""
    # font.getsize() was removed in Pillow 10. The right/bottom edges reported
    # by getbbox() with the default anchor equal what getsize() used to return.
    if hasattr(font, "getbbox"):
        right, bottom = font.getbbox(text)[2:4]
        return right, bottom
    return font.getsize(text)


def draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color):
    """Draw the label strings of ``box`` as filled tags stacked next to the box.

    :param draw: drawing object providing ``rectangle`` and ``text``
    :param box_to_display_str_map: mapping from box to its list of label strings
    :param box: key into ``box_to_display_str_map``
    :param left: x coordinate where the tags start
    :param right: right edge of the box (accepted for interface symmetry, unused)
    :param top: top edge of the box; tags go above it when there is room
    :param bottom: bottom edge of the box; tags go below it otherwise
    :param color: fill colour of the tag background
    :return: None
    """
    try:
        font = ImageFont.truetype('arial.ttf', 20)
    except IOError:
        # Fall back to the built-in font when arial.ttf cannot be loaded.
        font = ImageFont.load_default()

    display_strs = box_to_display_str_map[box]

    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    display_str_heights = [_text_size(font, ds)[1] for ds in display_strs]
    # Each display_str has a top and bottom margin of 0.05x.
    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

    if top > total_display_str_height:
        text_bottom = top
    else:
        text_bottom = bottom + total_display_str_height
    # Reverse list and print from bottom to top.
    for display_str in display_strs[::-1]:
        text_width, text_height = _text_size(font, display_str)
        margin = np.ceil(0.05 * text_height)
        draw.rectangle([(left, text_bottom - text_height - 2 * margin),
                        (left + text_width, text_bottom)], fill=color)
        draw.text((left + margin, text_bottom - text_height - margin),
                  display_str,
                  fill='black',
                  font=font)
        # Advance by the full tag height (text plus both margins) so stacked
        # tags do not overlap; the previous step subtracted the margins instead.
        text_bottom -= text_height + 2 * margin

解析

def draw_box(image, boxes, classes, scores, category_index, thresh=0.1, line_thickness=3):
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)

image：原图Img_o
boxes：预测框的前4个坐标x1y1x2y2
classes：预测框的类（逻辑符号从1开始计算）
scores：cls_conf类置信度
category_index：分类字典
thresh：cls_conf阈值
line_thickness：box的边缘线宽像素为3

    filter_low_thresh(boxes, scores, classes, category_index, thresh, box_to_display_str_map, box_to_color_map)

对低类置信度cls_conf的预测框进行筛除，这一步只对NMS中multi_label为False或者单分类的预测模式有作用，因为经过NMS时在multi_label为True并且多分类时会对cls_conf进行筛选。

    # Draw all boxes onto image.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

将ndarray对象转化为image对象

    draw = ImageDraw.Draw(image)

初始化ImageDraw对象

    im_width, im_height = image.size
    for box, color in box_to_color_map.items():
        xmin, ymin, xmax, ymax = box
        (left, right, top, bottom) = (xmin * 1, xmax * 1,
                                      ymin * 1, ymax * 1)
        draw.line([(left, top), (left, bottom), (right, bottom),
                   (right, top), (left, top)], width=line_thickness, fill=color)

读取box_to_color_map字典在draw对象画出对应坐标的线框

        draw_text(draw, box_to_display_str_map, box, left, right, top, bottom, color)

draw_text的代码详见：draw_text