Faster R-CNN源码学习

weixin_51431157

已于 2023-04-28 16:45:28 修改

阅读量313

点赞数 1

分类专栏：目标检测文章标签： python 开发语言深度学习人工智能机器学习

于 2023-04-25 17:25:23 首次发布

本文链接：https://blog.csdn.net/weixin_51431157/article/details/130369808

版权

目标检测专栏收录该内容

7 篇文章 0 订阅

订阅专栏

通过在源码上debug，并一步步写注释来理解源码。

Faster R-CNN流程图：

预测过程整体可分七步：

1、输入图片

2、利用主干特征提取网络或得共享特征层（获得feature map(执行Resnet50的layer0、1、2、3)）

3、利用共享特征层获得建议框（经过三次卷积对先验框进行调整获得建议框）

4、对建议框解码获得需要截取的位置

5、对共享特征层进行截取(ROIPooling:Resnet50的layer4以及后面的操作)

6、对截取后的特征层呢进行resize和下一步的卷积

7、获得最终的预测结果并解码

一、获得主干特征提取网络（feature map）

resnet50.py文件

ResNet50结构参考下图50-layer

resnet50.py源码文件：

经过layer3（第四次压缩）之后得到feature map（公用特征层），其有两个应用，一个是和ROIPooling结合使用、另一个是进行一次3x3的卷积后，进行一个18通道的1x1卷积，还有一个36通道的1x1卷积。

9 x 4的卷积 用于预测 公用特征层上 每一个网格点上每一个先验框的变化情况。（结果会对这些先验框进行调整，获得一个新的框。）-->得到建议框

9 x 2的卷积 用于预测 公用特征层上 每一个网格点上 每一个预测框内部是否包含了物体，序号为1的内容为包含物体的概率。（判断上述获得的新框是否包含物体。）

import math

import torch.nn as nn
from torch.hub import load_state_dict_from_url


class Bottleneck(nn.Module):
    expansion = 4
    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        #-----------------------------------#
        #   假设输入进来的图片是600,600,3
        #-----------------------------------#
        self.inplanes = 64
        super(ResNet, self).__init__()

        # 600,600,3 -> 300,300,64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # 300,300,64 -> 150,150,64
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True)

        # 150,150,64 -> 150,150,256
        self.layer1 = self._make_layer(block, 64, layers[0])   #执行三次残差
        # 150,150,256 -> 75,75,512
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        # 75,75,512 -> 38,38,1024 到这里可以获得一个38,38,1024的共享特征层
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        # self.layer4被用在classifier模型中
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        
        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        #-------------------------------------------------------------------#
        #   当模型需要进行高和宽的压缩的时候，就需要用到残差边的downsample
        #-------------------------------------------------------------------#
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

def resnet50(pretrained = False):
    model = ResNet(Bottleneck, [3, 4, 6, 3])
    if pretrained:
        state_dict = load_state_dict_from_url("https://download.pytorch.org/models/resnet50-19c8e357.pth", model_dir="./model_data")
        model.load_state_dict(state_dict)
    #----------------------------------------------------------------------------#
    #   获取特征提取部分，从conv1到model.layer3，最终获得一个38,38,1024的特征层
    #----------------------------------------------------------------------------#
    features    = list([model.conv1, model.bn1, model.relu, model.maxpool, model.layer1, model.layer2, model.layer3])
    #----------------------------------------------------------------------------#
    #   获取分类部分，从model.layer4到model.avgpool
    #----------------------------------------------------------------------------#
    classifier  = list([model.layer4, model.avgpool])
    
    features    = nn.Sequential(*features)
    classifier  = nn.Sequential(*classifier)
    return features, classifier

# if __name__ == "__main__":
#     resnet50()

二、先验框的实现

anchors.py文件

目的是将每个像素（特征点）的九个先验框绘制在特征图上。

在代码中对执行的每一步添加了注释。

import numpy as np

#--------------------------------------------#
#   生成基础的先验框
#--------------------------------------------#
def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32]):
    anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), dtype=np.float32)  #九行四列（9，4）
    for i in range(len(ratios)):   #循环三次（三种不同比例大小的框） 
        for j in range(len(anchor_scales)):    #循环三次（一种比例有三个框）
            h = base_size * anchor_scales[j] * np.sqrt(ratios[i])  #np.sqrt计算平方根
            w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])

            index = i * len(anchor_scales) + j
            anchor_base[index, 0] = - h / 2.
            anchor_base[index, 1] = - w / 2.
            anchor_base[index, 2] = h / 2.
            anchor_base[index, 3] = w / 2.
    return anchor_base   #九行四列的九个建议框位置，四列为X1，Y1，X2，Y2左上角和右下角坐标

#--------------------------------------------#
#   对基础先验框进行拓展对应到所有特征点上
#--------------------------------------------#
def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):  #（anchor_base，16，38，38）
    #---------------------------------#
    #   计算网格中心点
    #---------------------------------#
    shift_x             = np.arange(0, width * feat_stride, feat_stride)  #生成一个序列（0，16，32，。。。，37*16=592）一共38个点
    shift_y             = np.arange(0, height * feat_stride, feat_stride)  #38个点
    shift_x, shift_y    = np.meshgrid(shift_x, shift_y)   #组合起来，建立网格
    shift               = np.stack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel(),), axis=1)  #堆叠 （1444，4），38*38=1444个特征点

    #---------------------------------#
    #   每个网格点上的9个先验框
    #---------------------------------#
    A       = anchor_base.shape[0]   #9  9个框
    K       = shift.shape[0]   #38*38=1444   1444个象数点
    anchor  = anchor_base.reshape((1, A, 4)) + shift.reshape((K, 1, 4))   #（1，9，4）+（1444，1，4）=（1444，9，4）利用了python的广播机制
    #---------------------------------#
    #   所有的先验框
    #---------------------------------#
    anchor  = anchor.reshape((K * A, 4)).astype(np.float32)   #（12996，4） 1444（象数点）*9（每个象数点的先验框数）=12996个先验框 astype(np.float32)用于将数组的数据类型转换为 float32 类型。
    return anchor
    
if __name__ == "__main__":
    import matplotlib.pyplot as plt
    nine_anchors = generate_anchor_base()   #每个像素点有九个先验框
    print(nine_anchors)     #终端输出九个先验框的位置

    height, width, feat_stride  = 38,38,16
    anchors_all                 = _enumerate_shifted_anchor(nine_anchors, feat_stride, height, width)   #返回12996个先验框的数据
    print(np.shape(anchors_all))   #（12996，4）
    
    fig     = plt.figure()
    #add_subplot(3,4,11)
    #参数一：3，子图总行数；参数二：4，子图总列数；参数三：11，子图位置（将画布分割成3行4列，图像画在从左到右从上到下的第11块。）
    #add_subplot（111）参数111的意思是：将画布分割成1行1列，图像画在从左到右从上到下的第1块。当只想画一张图时就使用111。
    ax      = fig.add_subplot(111) 
    #设置 x，y 轴的数值显示范围 
    plt.ylim(-300,900)
    plt.xlim(-300,900)
    shift_x = np.arange(0, width * feat_stride, feat_stride)  #生成一个序列，三个参数（起始值，终止值，步长）
    shift_y = np.arange(0, height * feat_stride, feat_stride)
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)  #生成笛卡尔坐标系（将x和y结合起来生成一个二维坐标系，目的是建立网格以及网格的坐标）
    plt.scatter(shift_x,shift_y)   #绘制网格图
    box_widths  = anchors_all[:,2]-anchors_all[:,0]   #计算宽度W=X2-X1
    box_heights = anchors_all[:,3]-anchors_all[:,1]   #计算高度H=Y2-Y1
    
    for i in [108, 109, 110, 111, 112, 113, 114, 115, 116]:   #108^9+1=13 第十三个象数点上的九个先验框
    #python matplotlib.Rectangle在一幅图片里作出一个或多个矩形框
    #rectangle函数的参数解释如下： Rectangle(xy,width,heigth,angle=0,**kwargs)
    #xy:2元组，矩形左下角坐标
    #width：矩形的宽度
    #height：矩形的高度
    #angle：float，可选，矩形相对于x轴逆时针旋转角度，默认0
    #fill：bool，可选，是否填充矩形
        rect = plt.Rectangle([anchors_all[i, 0],anchors_all[i, 1]],box_widths[i],box_heights[i],color="r",fill=False)
        ax.add_patch(rect)   #与plt.Rectangle一起使用，用来向Matplotlib绘图中的Axes对象中添加图形对象（例如矩形、圆等）的函数。
    plt.show()

三、通过RPN获得建议框

RPN代码：

RPN分为四步：

1.获得建议框网络的预测结果（三次卷积得到9*2（检测2中的先验框是否包含物体），9*4（检测2中的先验框的调整参数））

2.生成先验框anchor

3.用1对2的先验框进行调整

4.进行筛选（非极大值抑制等操作），最终得到建议框

到此位置还只是粗略的一个框的获取，也就是一个建议框。然后我们会在建议框里面继续找东西。


import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from torchvision.ops import nms
from utils.anchors import _enumerate_shifted_anchor, generate_anchor_base
from utils.utils_bbox import loc2bbox


class ProposalCreator():
    def __init__(
        self, 
        mode, 
        nms_iou             = 0.7,
        n_train_pre_nms     = 12000,
        n_train_post_nms    = 600,
        n_test_pre_nms      = 3000,
        n_test_post_nms     = 300,
        min_size            = 16
    
    ):
        #-----------------------------------#
        #   设置预测还是训练
        #-----------------------------------#
        self.mode               = mode   #training
        #-----------------------------------#
        #   建议框非极大抑制的iou大小
        #-----------------------------------#
        self.nms_iou            = nms_iou   #0.7
        #-----------------------------------#
        #   训练用到的建议框数量
        #-----------------------------------#
        self.n_train_pre_nms    = n_train_pre_nms   #12000
        self.n_train_post_nms   = n_train_post_nms  #600
        #-----------------------------------#
        #   预测用到的建议框数量
        #-----------------------------------#
        self.n_test_pre_nms     = n_test_pre_nms   #3000
        self.n_test_post_nms    = n_test_post_nms  #300
        self.min_size           = min_size     #16

    def __call__(self, loc, score, anchor, img_size, scale=1.):
        if self.mode == "training":
            n_pre_nms   = self.n_train_pre_nms
            n_post_nms  = self.n_train_post_nms
        else:
            n_pre_nms   = self.n_test_pre_nms
            n_post_nms  = self.n_test_post_nms

        #-----------------------------------#
        #   将先验框转换成tensor
        #-----------------------------------#
        anchor = torch.from_numpy(anchor).type_as(loc)
        #-----------------------------------#
        #   将RPN网络预测结果转化成建议框
        #-----------------------------------#
        roi = loc2bbox(anchor, loc)
        #-----------------------------------#
        #   防止建议框超出图像边缘
        #-----------------------------------#
        roi[:, [0, 2]] = torch.clamp(roi[:, [0, 2]], min = 0, max = img_size[1])
        roi[:, [1, 3]] = torch.clamp(roi[:, [1, 3]], min = 0, max = img_size[0])
        
        #-----------------------------------#
        #   建议框的宽高的最小值不可以小于16
        #-----------------------------------#
        min_size    = self.min_size * scale
        keep        = torch.where(((roi[:, 2] - roi[:, 0]) >= min_size) & ((roi[:, 3] - roi[:, 1]) >= min_size))[0]
        #-----------------------------------#
        #   将对应的建议框保留下来
        #-----------------------------------#
        roi         = roi[keep, :]
        score       = score[keep]

        #-----------------------------------#
        #   根据得分进行排序，取出建议框
        #-----------------------------------#
        order       = torch.argsort(score, descending=True)
        if n_pre_nms > 0:
            order   = order[:n_pre_nms]
        roi     = roi[order, :]
        score   = score[order]

        #-----------------------------------#
        #   对建议框进行非极大抑制
        #   使用官方的非极大抑制会快非常多
        #-----------------------------------#
        keep    = nms(roi, score, self.nms_iou)
        if len(keep) < n_post_nms:
            index_extra = np.random.choice(range(len(keep)), size=(n_post_nms - len(keep)), replace=True)
            keep        = torch.cat([keep, keep[index_extra]])
        keep    = keep[:n_post_nms]
        roi     = roi[keep]
        return roi






# #--------------------------------------------#
# #   生成基础的先验框
# #--------------------------------------------#
# def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32]):
#     anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), dtype=np.float32)  #九行四列（9，4）
#     for i in range(len(ratios)):   #循环三次（三种不同比例大小的框） 
#         for j in range(len(anchor_scales)):    #循环三次（一种比例有三个框）
#             h = base_size * anchor_scales[j] * np.sqrt(ratios[i])  #np.sqrt计算平方根
#             w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])

#             index = i * len(anchor_scales) + j
#             anchor_base[index, 0] = - h / 2.
#             anchor_base[index, 1] = - w / 2.
#             anchor_base[index, 2] = h / 2.
#             anchor_base[index, 3] = w / 2.
#     return anchor_base   #九行四列的九个建议框位置，四列为X1，Y1，X2，Y2左上角和右下角坐标

# def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):  #（anchor_base，16，38，38）
#     #---------------------------------#
#     #   计算网格中心点
#     #---------------------------------#
#     shift_x             = np.arange(0, width * feat_stride, feat_stride)  #生成一个序列（0，16，32，。。。，37*16=592）一共38个点
#     shift_y             = np.arange(0, height * feat_stride, feat_stride)  #38个点
#     shift_x, shift_y    = np.meshgrid(shift_x, shift_y)   #组合起来，建立网格
#     shift               = np.stack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel(),), axis=1)  #堆叠 （1444，4），38*38=1444个特征点

#     #---------------------------------#
#     #   每个网格点上的9个先验框
#     #---------------------------------#
#     A       = anchor_base.shape[0]   #9  9个框
#     K       = shift.shape[0]   #38*38=1444   1444个象数点
#     anchor  = anchor_base.reshape((1, A, 4)) + shift.reshape((K, 1, 4))   #（1，9，4）+（1444，1，4）=（1444，9，4）利用了python的广播机制
#     #---------------------------------#
#     #   所有的先验框
#     #---------------------------------#
#     anchor  = anchor.reshape((K * A, 4)).astype(np.float32)   #（12996，4） 1444（象数点）*9（每个象数点的建议框数）=12996个建议框 astype(np.float32)用于将数组的数据类型转换为 float32 类型。
#     return anchor







class RegionProposalNetwork(nn.Module):
    def __init__(
        self, 
        in_channels     = 512, 
        mid_channels    = 512,     #优先以外部传入参数为准，传入1024，512  对应in_channels = 1024 ，mid_channels = 512
        ratios          = [0.5, 1, 2],
        anchor_scales   = [8, 16, 32], 
        feat_stride     = 16,
        mode            = "training",
    ):
        super(RegionProposalNetwork, self).__init__()
        #-----------------------------------------#
        #   生成基础先验框，shape为[9, 4]
        #-----------------------------------------#
        self.anchor_base    = generate_anchor_base(anchor_scales = anchor_scales, ratios = ratios)   #（9，4）  一个特征点上九个框左上角和右下角的坐标
        n_anchor            = self.anchor_base.shape[0]  #9个建议框

        #-----------------------------------------#
        #   先进行一个3x3的卷积，可理解为特征整合
        #-----------------------------------------#
        self.conv1  = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)   #一次3*3的卷积
        #-----------------------------------------#
        #   分类预测先验框内部是否包含物体
        #-----------------------------------------#
        self.score  = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)  #一次1*1的卷积
        #-----------------------------------------#
        #   回归预测对先验框进行调整
        #-----------------------------------------#
        self.loc    = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)   #一次1*1的卷积    这三次卷积对应流程图左边的操作

        #-----------------------------------------#
        #   特征点间距步长
        #-----------------------------------------#
        self.feat_stride    = feat_stride   #16
        #-----------------------------------------#
        #   用于对建议框解码并进行非极大抑制
        #-----------------------------------------#
        self.proposal_layer = ProposalCreator(mode)   #记录一些参数
        #--------------------------------------#
        #   对FPN的网络部分进行权值初始化
        #--------------------------------------#
        normal_init(self.conv1, 0, 0.01)
        normal_init(self.score, 0, 0.01)
        normal_init(self.loc, 0, 0.01)

    def forward(self, x, img_size, scale=1.):
        n, _, h, w = x.shape
        #-----------------------------------------#
        #   先进行一个3x3的卷积，可理解为特征整合
        #-----------------------------------------#
        x = F.relu(self.conv1(x))    #输入的x是feature map共享特征层（38，38，1024）
        #-----------------------------------------#
        #   回归预测对先验框进行调整
        #-----------------------------------------#
        rpn_locs = self.loc(x)
        rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)   #将通道数调整至最后一个维度，再进行reshape，成了（batch_size,每一个先验框，rpn_locs：对先验框进行调整获得建议框/rpn_scores：用来判断是否包含物体）
        #-----------------------------------------#
        #   分类预测先验框内部是否包含物体
        #-----------------------------------------#
        rpn_scores = self.score(x)
        rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous().view(n, -1, 2)
        
        #--------------------------------------------------------------------------------------#
        #   进行softmax概率计算，每个先验框只有两个判别结果
        #   内部包含物体或者内部不包含物体，rpn_softmax_scores[:, :, 1]的内容为包含物体的概率
        #--------------------------------------------------------------------------------------#
        rpn_softmax_scores  = F.softmax(rpn_scores, dim=-1)
        rpn_fg_scores       = rpn_softmax_scores[:, :, 1].contiguous()
        rpn_fg_scores       = rpn_fg_scores.view(n, -1)

        #------------------------------------------------------------------------------------------------#
        #   生成先验框，此时获得的anchor是布满网格点的，当输入图片为600,600,3的时候，shape为(12996, 4)
        #------------------------------------------------------------------------------------------------#
        anchor = _enumerate_shifted_anchor(np.array(self.anchor_base), self.feat_stride, h, w)
        rois        = list()
        roi_indices = list()
        for i in range(n):
            roi         = self.proposal_layer(rpn_locs[i], rpn_fg_scores[i], anchor, img_size, scale = scale)
            batch_index = i * torch.ones((len(roi),))
            rois.append(roi.unsqueeze(0))
            roi_indices.append(batch_index.unsqueeze(0))

        rois        = torch.cat(rois, dim=0).type_as(x)
        roi_indices = torch.cat(roi_indices, dim=0).type_as(x)
        anchor      = torch.from_numpy(anchor).unsqueeze(0).float().to(x.device)
        
        return rpn_locs, rpn_scores, rois, roi_indices, anchor

def normal_init(m, mean, stddev, truncated=False):
    if truncated:
        m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
    else:
        m.weight.data.normal_(mean, stddev)  #服从高斯分布
        m.bias.data.zero_()   #设置无偏差


# if __name__ == "__main__":
#     #---------------------------------#
#     #   构建classifier网络（构建建议框网络）
#     #---------------------------------#
#     rpn = RegionProposalNetwork(
#         1024, 512,
#         # ratios          = [0.5, 1, 2],
#         # anchor_scales   = [8, 16, 32],
#         # feat_stride     = 16,
#     )

图中三次卷积（获得对先验框的调整参数和预测是否有物体）对应在RPN中的代码

# 先进行一个3x3的卷积，可理解为特征整合

self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1) #一次3*3的卷积

# 分类预测先验框内部是否包含物体

self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0) #一次1*1的卷积

# 回归预测对先验框进行调整

self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0) #一次1*1的卷积这三次卷积对应流程图左边的操作

def forward(self, x, img_size, scale=1.):

n, _, h, w = x.shape

# 先进行一个3x3的卷积，可理解为特征整合

x = F.relu(self.conv1(x)) #输入的x是feature map共享特征层（38，38，1024）

# 回归预测对先验框进行调整

rpn_locs = self.loc(x)

rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)

# 分类预测先验框内部是否包含物体

rpn_scores = self.score(x)

rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous().view(n, -1, 2)

对Proposal建议框的解码：

class ProposalCreator():
    def __init__(
        self, 
        mode, 
        nms_iou             = 0.7,
        n_train_pre_nms     = 12000,
        n_train_post_nms    = 600,
        n_test_pre_nms      = 3000,
        n_test_post_nms     = 300,
        min_size            = 16
    
    ):
        #-----------------------------------#
        #   设置预测还是训练
        #-----------------------------------#
        self.mode               = mode
        #-----------------------------------#
        #   建议框非极大抑制的iou大小
        #-----------------------------------#
        self.nms_iou            = nms_iou
        #-----------------------------------#
        #   训练用到的建议框数量
        #-----------------------------------#
        self.n_train_pre_nms    = n_train_pre_nms
        self.n_train_post_nms   = n_train_post_nms
        #-----------------------------------#
        #   预测用到的建议框数量
        #-----------------------------------#
        self.n_test_pre_nms     = n_test_pre_nms
        self.n_test_post_nms    = n_test_post_nms
        self.min_size           = min_size

    def __call__(self, loc, score, anchor, img_size, scale=1.):
        if self.mode == "training":
            n_pre_nms   = self.n_train_pre_nms
            n_post_nms  = self.n_train_post_nms
        else:
            n_pre_nms   = self.n_test_pre_nms
            n_post_nms  = self.n_test_post_nms

        #-----------------------------------#
        #   将先验框转换成tensor
        #-----------------------------------#
        anchor = torch.from_numpy(anchor)
        if loc.is_cuda:
            anchor = anchor.cuda()
        #-----------------------------------#
        #   将RPN网络预测结果转化成建议框
        #-----------------------------------#
        roi = loc2bbox(anchor, loc)
        #-----------------------------------#
        #   防止建议框超出图像边缘
        #-----------------------------------#
        roi[:, [0, 2]] = torch.clamp(roi[:, [0, 2]], min = 0, max = img_size[1])
        roi[:, [1, 3]] = torch.clamp(roi[:, [1, 3]], min = 0, max = img_size[0])
        
        #-----------------------------------#
        #   建议框的宽高的最小值不可以小于16
        #-----------------------------------#
        min_size    = self.min_size * scale
        keep        = torch.where(((roi[:, 2] - roi[:, 0]) >= min_size) & ((roi[:, 3] - roi[:, 1]) >= min_size))[0]
        #-----------------------------------#
        #   将对应的建议框保留下来
        #-----------------------------------#
        roi         = roi[keep, :]
        score       = score[keep]

        #-----------------------------------#
        #   根据得分进行排序，取出建议框
        #-----------------------------------#
        order       = torch.argsort(score, descending=True)
        if n_pre_nms > 0:
            order   = order[:n_pre_nms]
        roi     = roi[order, :]
        score   = score[order]

        #-----------------------------------#
        #   对建议框进行非极大抑制
        #   使用官方的非极大抑制会快非常多
        #-----------------------------------#
        keep    = nms(roi, score, self.nms_iou)
        keep    = keep[:n_post_nms]
        roi     = roi[keep]
        return roi

四、ROIPooling(对Proposal建议框加以利用)

建议框会对公用特征层feature map（维度为38*38*1024）的38x38个区域进行截取，也就是认为这些区域里存在目标，然后将截取的结果进行resize，resize到14x14x1024的大小。

然后再对每个建议框再进行Resnet原有的第五次压缩（layer4）。压缩完后进行一个平均池化，再进行一个Flatten，最后分别进行一个num_classes的全连接和(num_classes)x4全连接。

（num_classes的全连接用于对最后获得的框进行分类，(num_classes)x4全连接用于对相应的建议框进行调整。）

对Proposal建议框加以利用的过程与shape变化如图所示：

class Resnet50RoIHead(nn.Module):
    def __init__(self, n_class, roi_size, spatial_scale, classifier):
        super(Resnet50RoIHead, self).__init__()
        self.classifier = classifier
        #--------------------------------------#
        #   对ROIPooling后的的结果进行回归预测
        #--------------------------------------#
        self.cls_loc = nn.Linear(2048, n_class * 4)
        #-----------------------------------#
        #   对ROIPooling后的的结果进行分类
        #-----------------------------------#
        self.score = nn.Linear(2048, n_class)
        #-----------------------------------#
        #   权值初始化
        #-----------------------------------#
        normal_init(self.cls_loc, 0, 0.001)
        normal_init(self.score, 0, 0.01)

        self.roi = RoIPool((roi_size, roi_size), spatial_scale)

    def forward(self, x, rois, roi_indices, img_size):
        n, _, _, _ = x.shape
        if x.is_cuda:
            roi_indices = roi_indices.cuda()
            rois = rois.cuda()
        rois        = torch.flatten(rois, 0, 1)   #(300,4)
        roi_indices = torch.flatten(roi_indices, 0, 1)   #(300,)
        
        rois_feature_map = torch.zeros_like(rois)   #(300,4)
        rois_feature_map[:, [0,2]] = rois[:, [0,2]] / img_size[1] * x.size()[3]
        rois_feature_map[:, [1,3]] = rois[:, [1,3]] / img_size[0] * x.size()[2]

        indices_and_rois = torch.cat([roi_indices[:, None], rois_feature_map], dim=1)
        #-----------------------------------#
        #   利用建议框对公用特征层进行截取
        #-----------------------------------#
        pool = self.roi(x, indices_and_rois)
        #-----------------------------------#
        #   利用classifier网络进行特征提取
        #-----------------------------------#
        fc7 = self.classifier(pool)
        #--------------------------------------------------------------#
        #   当输入为一张图片的时候，这里获得的f7的shape为[300, 2048]
        #--------------------------------------------------------------#
        fc7 = fc7.view(fc7.size(0), -1)

        roi_cls_locs    = self.cls_loc(fc7)  #(300,84)
        roi_scores      = self.score(fc7)
        roi_cls_locs    = roi_cls_locs.view(n, -1, roi_cls_locs.size(1))
        roi_scores      = roi_scores.view(n, -1, roi_scores.size(1))
        return roi_cls_locs, roi_scores

最后即可进行预测框绘制