def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA=True):
"""
x = torch.nn.Conv2d(x.shape[1], (5+num_classes)*len(anchors), kernel_size=1, stride=1, padding=0)(x)
https://blog.paperspace.com/how-to-implement-a-yolo-v3-object-detector-from-scratch-in-pytorch-part-3/
:param prediction: [2, 255, 13, 13]. 这个255=(1+4+80)*3
:param inp_dim: 416. 图片原始大小
:param anchors: [(116, 90), (156, 198), (373, 326)]. 在原图上的预设anchors大小hw
:param num_classes: 80. coco类别数
:param CUDA: bool. 是否加速
:return: [b, bbox_num, 85]. bbox_num = 13*13*3 / 26*26*3 / 52*52*3. 85=x, y, h, w, cls_0, cls_1, ..., cls_79.
back to origin size.
"""
    batch_size = prediction.size(0)  # 2, the batch size
    stride = inp_dim // prediction.size(2)  # 32, the downsampling factor of this head
    grid_size = inp_dim // stride  # 13, the prediction feature map is 13x13; each pixel is one grid cell
    bbox_attrs = 5 + num_classes  # 85, attributes per predicted box: 1 + 4 + 80
    num_anchors = len(anchors)  # 3, anchors predicted per grid cell at this scale
    # [2, 255, 13, 13] -> [2, 255, 13*13], i.e. [b, all attributes of all anchors, all cells]
    prediction = prediction.view(batch_size, bbox_attrs * num_anchors, grid_size * grid_size)
    # -> [b, 13*13, 85*3], i.e. [b, all cells, all attributes]; one row holds every prediction of one cell
    prediction = prediction.transpose(1, 2).contiguous()
    # [b, 13*13, 85*3] -> [b, 13*13*3, 85], i.e. one row holds all attributes of one predicted box
    prediction = prediction.view(batch_size, grid_size * grid_size * num_anchors, bbox_attrs)
    # rescale the anchors from input-image pixels to feature-map (grid) units
    anchors = [(a[0] / stride, a[1] / stride) for a in anchors]
    # Sigmoid the centre_X, centre_Y and object confidence -> [0, 1]
    prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])  # centre_X
    prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])  # centre_Y
    prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])  # object confidence
# Add the center offsets
    grid = np.arange(grid_size)  # [0, 1, 2, ..., 12]
    a, b = np.meshgrid(grid, grid)  # x, y indices of the feature map, scanned left to right, top to bottom
"""
a = [
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12],
[ 0 1 2 3 4 5 6 7 8 9 10 11 12]]
b = [
[ 0 0 0 0 0 0 0 0 0 0 0 0 0],
[ 1 1 1 1 1 1 1 1 1 1 1 1 1],
[ 2 2 2 2 2 2 2 2 2 2 2 2 2],
[ 3 3 3 3 3 3 3 3 3 3 3 3 3],
[ 4 4 4 4 4 4 4 4 4 4 4 4 4],
[ 5 5 5 5 5 5 5 5 5 5 5 5 5],
[ 6 6 6 6 6 6 6 6 6 6 6 6 6],
[ 7 7 7 7 7 7 7 7 7 7 7 7 7],
[ 8 8 8 8 8 8 8 8 8 8 8 8 8],
[ 9 9 9 9 9 9 9 9 9 9 9 9 9],
[10 10 10 10 10 10 10 10 10 10 10 10 10],
[11 11 11 11 11 11 11 11 11 11 11 11 11],
[12 12 12 12 12 12 12 12 12 12 12 12 12]]
"""
    x_offset = torch.FloatTensor(a).view(-1, 1)  # [13, 13] -> [169, 1], x index of each cell
    y_offset = torch.FloatTensor(b).view(-1, 1)  # [13, 13] -> [169, 1], y index of each cell
"""
x_offset = [[ 0.], [ 1.], [ 2.], [ 3.], [ 4.], [ 5.], [ 6.], [ 7.], [ 8.], [ 9.], [10.], [11.], [12.], [ 0.], [ 1.],
[ 2.], ..., [12.], [12.]]
y_offset = [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [1.], [1.], [1.], [1.],
[1.], [1.], [1.], [1.], [1.], [1.], [1.], [1.], [1.], [2.], [2.], ..., [12.], [12.]]
"""
if CUDA:
x_offset = x_offset.cuda()
y_offset = y_offset.cuda()
prediction = prediction.cuda()
    # -> [1, 13*13*3, 2], i.e. [1, all anchor boxes, (x, y) index of the cell each box belongs to];
    # the scan order is left to right, top to bottom
x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)
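    # Optional sanity check (not in the original tutorial): each cell's (x, y) index must be
    # repeated once per anchor, giving one offset row per predicted box.
    assert x_y_offset.shape == (1, grid_size * grid_size * num_anchors, 2)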
"""
    Scan order: left to right, top to bottom.
    x_y_offset[0, :, :] = [
[ 0. 0.],
[ 0. 0.],
[ 0. 0.],
[ 1. 0.],
[ 1. 0.],
[ 1. 0.],
[ 2. 0.],
...
[11. 12.],
[11. 12.],
[11. 12.],
[12. 12.],
[12. 12.],
        [12. 12.]]
"""
    # 1) x, y: add each cell's offset to the (already sigmoided) centre predictions
    """
    x = x_pred + x_cell
    y = y_pred + y_cell
    where (x_cell, y_cell) is the index of the grid cell and x_pred, y_pred are the
    sigmoided centre offsets within that cell.
    """
prediction[:, :, :2] += x_y_offset
    # log-space transform of the width and height, scaled by the anchor priors
anchors = torch.FloatTensor(anchors)
if CUDA:
anchors = anchors.cuda()
    anchors = anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)  # [3, 2] -> [1, 507, 2], i.e. [1, all anchor boxes, anchor size in grid units]
"""
anchors = [
[ 3.625 2.8125 ],
[ 4.875 6.1875 ],
[11.65625 10.1875 ],
[ 3.625 2.8125 ],
[ 4.875 6.1875 ],
[11.65625 10.1875 ],
...
"""
    # 2) w, h: scale the predicted sizes by the preset anchors
    # attribute layout: x, y, w, h, obj, cls_0, cls_1, ..., cls_79
    # prediction: [2, 507, 85], where 507 is the number of predicted boxes; prediction[:, :, 2:4] has shape [2, 13*13*3, 2]
prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4]) * anchors # anchors.shape: [1, 13*13*3, 2]
    # 3) class scores: sigmoid the 80 per-class confidences
    prediction[:, :, 5: 5 + num_classes] = torch.sigmoid(prediction[:, :, 5: 5 + num_classes])
    # 4) map x, y, w, h from grid units back to the input-image scale
prediction[:, :, :4] *= stride
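    # Optional sanity check (not in the original tutorial): one row per anchor box, 85 attributes each.
    assert prediction.shape == (batch_size, grid_size * grid_size * num_anchors, bbox_attrs)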
return prediction
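

# Minimal usage sketch (assumes `import torch` and `import numpy as np` at the top of this
# file, as the function above requires; the random input below is made up purely to
# illustrate the expected shapes of the 13x13 / stride-32 head).
if __name__ == "__main__":
    num_classes = 80
    anchors = [(116, 90), (156, 198), (373, 326)]  # anchors of the 13x13 head, in input-image pixels
    inp_dim = 416

    # Fake detection-head output: [batch, (5 + num_classes) * num_anchors, 13, 13]
    raw = torch.randn(2, (5 + num_classes) * len(anchors), 13, 13)

    boxes = predict_transform(raw, inp_dim, anchors, num_classes, CUDA=False)
    print(boxes.shape)  # torch.Size([2, 507, 85]): x, y, w, h, obj, 80 class scores per box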