PyTorch Implementation of Faster R-CNN (unfinished)

Training part

Dataset: VOC2007

Pretrained backbone: download the weights yourself

import pdb  # debugging tool
import math

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from torch.autograd import Variable

torch.manual_seed(1)
'''
Input image -> backbone extracts the shared feature map -> the RPN turns the shared feature map into proposals
-> the proposals are decoded into crop locations -> the feature map is cropped -> resize and conv -> final predictions, which are then decoded

Conv Block: input and output dimensions differ, so it cannot be stacked back to back; it is used to change the network's dimensions
Identity Block: input and output dimensions match, so it can be stacked; it is used to deepen the network
'''


# 1. Backbone (conv layers): the feature maps are shared by the RPN (RegionProposalNet) and the fully connected head (Resnet50RoIHead)
class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)  # batch-normalize so the activations stay in a stable range before the ReLU; argument: number of feature channels

        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)

        self.relu = nn.ReLU(inplace=True)  # inplace=True overwrites the input tensor; False leaves the input intact and allocates a new output
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out


class Resnet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        # image = [600, 600, 3]
        self.inplanes = 64
        super(Resnet, self).__init__()

        # [600, 600, 3] -> [300, 300, 64]
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # [300, 300, 64] -> [150, 150, 64]
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True)  # ceil_mode=True rounds the output size up

        # [150, 150, 64] -> [150, 150, 256]
        self.layer1 = self._make_layer(block, 64, layers[0])

        # [150,150,256] -> [75,75,512]
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)

        # [75,75,512] -> [38,38,1024]; at this point we have the [38,38,1024] shared feature map
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)

        # self.layer4 is used in the classifier head
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        # whenever the height and width are compressed, the residual branch needs a downsample
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


def resnet50():
    model = Resnet(Bottleneck, [3, 4, 6, 3])

    # the feature-extraction part, from conv1 through model.layer3, which produces the 38,38,1024 feature map
    features = list([model.conv1, model.bn1, model.relu, model.maxpool, model.layer1, model.layer2, model.layer3])

    # the classification part, from model.layer4 through model.avgpool

    classifier = list([model.layer4, model.avgpool])

    features = nn.Sequential(*features)
    classifier = nn.Sequential(*classifier)

    # features is the shared feature extractor; classifier is the second-stage classifier
    return features, classifier
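
'''
A quick sanity check added for illustration (the helper name _check_backbone_shapes
is hypothetical, not part of the original script): with a 600x600x3 input the shared
feature map should come out as [1, 1024, 38, 38], and the classifier head should map
a 14x14 RoI crop down to a 2048-d vector, matching the shape comments above.
'''


def _check_backbone_shapes():
    features, classifier = resnet50()
    with torch.no_grad():
        feat = features(torch.randn(1, 3, 600, 600))
        head = classifier(torch.randn(1, 1024, 14, 14))
    print(feat.shape)  # torch.Size([1, 1024, 38, 38])
    print(head.shape)  # torch.Size([1, 2048, 1, 1])

# _check_backbone_shapes()
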


# 2.1 Proposals: generating the anchors
def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32]):
    anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), dtype=np.float32)
    # print(anchor_base)
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
            w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])

            index = i * len(anchor_scales) + j
            anchor_base[index, 0] = - h / 2.
            anchor_base[index, 1] = - w / 2.
            anchor_base[index, 2] = h / 2.
            anchor_base[index, 3] = w / 2.
    # print(anchor_base)
    return anchor_base
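
'''
A quick check (hypothetical helper, for illustration): the 9 base anchors are centered
on the origin, and each scale keeps a fixed area while the ratio reshapes the box, so
scale 8 always covers 128*128 = 16384 pixels regardless of the aspect ratio.
'''


def _check_anchor_base():
    base = generate_anchor_base()
    print(base.shape)  # (9, 4)
    hw = base[:, 2:] - base[:, :2]  # per-anchor [h, w]
    print(hw.prod(axis=1))  # areas: 128^2, 256^2, 512^2, repeated for each ratio

# _check_anchor_base()
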


def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
    # 1. Generate anchors on the original image; suppose height = 4 and width = 4, so K = 4*4 = 16
    # compute the grid centers
    # width = 4, shift_x = [0, 16, 32, 48]
    shift_x = np.arange(0, width * feat_stride, feat_stride)

    # height = 4, shift_y = [0, 16, 32, 48]
    shift_y = np.arange(0, height * feat_stride, feat_stride)

    # meshgrid: shift_x = [ [0, 16, 32, 48],  [0, 16, 32, 48], [0, 16, 32, 48], [0, 16, 32, 48] ]
    # meshgrid: shift_y = [ [0, 0, 0, 0],  [16, 16, 16, 16], [32, 32, 32, 32], [48, 48, 48, 48] ]
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)  # build the grid coordinate matrices

    # ravel() flattens an array to 1-D; stack joins a sequence of arrays along a new axis
    # shift.T = [ [0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48],
    #           [0, 0, 0, 0, 16, 16, 16, 16, 32, 32, 32, 32, 48, 48, 48, 48],
    #           [0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48],
    #           [0, 0, 0, 0, 16, 16, 16, 16, 32, 32, 32, 32, 48, 48, 48, 48] ]
    # shift = np.stack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel(),), axis=1)
    shift = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel(),)).transpose()

    # 2. The base anchors
    # the 9 prior boxes at each grid point, shape [9, 4]
    A = anchor_base.shape[0]
    # K is the number of feature-map pixels, 4*4 = 16
    K = shift.shape[0]

    # 3. Shift the 9 anchors to each of the K (16) positions given by shifts:
    # add the first two shift columns to each anchor's top-left corner and the last two to its bottom-right corner.
    # Since the first two and last two columns are identical, both corners move by the same amount.
    anchor = anchor_base.reshape((1, A, 4)) + shift.reshape((K, 1, 4))

    # all the priors: each image yields K * A anchors, each described by 4 coordinates
    anchor = anchor.reshape((K * A, 4)).astype(np.float32)

    return anchor
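
'''
A quick check (hypothetical helper): on the 38x38 shared feature map with
feat_stride=16, tiling the 9 base anchors over every grid point gives
38 * 38 * 9 = 12996 anchors, the (12996, 4) shape quoted in RegionProposalNet below.
'''


def _check_shifted_anchors():
    base = generate_anchor_base()
    anchor = _enumerate_shifted_anchor(base, feat_stride=16, height=38, width=38)
    print(anchor.shape)  # (12996, 4)

# _check_shifted_anchors()
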


# 2.2 Proposals. Region Proposal Network: generates region proposals. A softmax classifies each anchor as
#                                         positive or negative, and bounding-box regression then refines the anchors into accurate proposals (rois)
'''
1. Run a 3x3 conv (rpn_conv) over the input feature map
2. The rpn_cls conv layer produces the classification result
3. The rpn_reg conv layer produces the regression result
4. The proposal function then turns these into proposals (rois)
5. During training, anchor_target produces the classification and regression ground truth for the RPN, used later in the RPN loss
6. During training, compute the classification loss
7. During training, compute the regression loss
'''


class RegionProposalNet(nn.Module):
    def __init__(self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32],
                 feat_stride=16, mode='training'):
        super(RegionProposalNet, self).__init__()
        # stride: the total downsampling factor
        self.feat_stride = feat_stride
        self.proposal_layer = ProposalCreator(mode)
        # generate the base priors, shape [9, 4]
        self.anchor_base = generate_anchor_base(anchor_scales=anchor_scales, ratios=ratios)
        n_anchor = self.anchor_base.shape[0]

        # [38,38,1024] -> [38,38,512]
        # first a 3x3 conv to consolidate the features
        self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
        # [38,38,512] -> [38,38,18]
        # classification: does each prior contain an object?
        self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)
        # [38,38,512] -> [38,38,36]
        # regression: offsets that adjust each prior
        self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)

        # initialize the RPN layers' weights
        normal_init(self.conv1, 0, 0.01)
        normal_init(self.score, 0, 0.01)
        normal_init(self.loc, 0, 0.01)

    def forward(self, x, img_size, scale=1.):
        n, c, h, w = x.shape
        # print('fearture_size_229:', x.shape)
        # first the 3x3 conv that consolidates the features
        x = self.conv1(x)
        x = F.relu(x)

        # 1. Conv: regression that adjusts the priors; rpn_locs are the adjustment parameters
        rpn_locs = self.loc(x)
        # permute: reorder the dimensions, moving channels to the last axis
        # contiguous: return a tensor with the same data laid out contiguously in memory (a no-op if it already is)
        # 2. Reshape to [n, 38×38×9, 4]
        rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)

        # 1. Conv: classification that predicts whether each prior contains an object; rpn_scores are the prior scores
        rpn_scores = self.score(x)
        # 2. Reshape [n, 38×38×9, 2]
        rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous().view(n, -1, 2)
        # view() flattens the tensor in row-major order (hence the contiguous-memory requirement) and regroups it into the requested shape
        # dim 0 is the batch, dim 1 indexes the priors, and the last dim says whether the prior contains an object

        # 3. Softmax: each prior has only two possible outcomes,
        # contains an object or not; rpn_softmax_scores[:, :, 1] is the probability of containing an object
        rpn_softmax_scores = F.softmax(rpn_scores, dim=-1)  # softmax over the last dim, so each pair of scores sums to 1
        # 4. Reshape [n, 38×38×9]
        rpn_fg_scores = rpn_softmax_scores[:, :, 1].contiguous().view(n, -1)
        # rpn_fg_scores = rpn_fg_scores.view(n, -1)

        # generate the priors; they tile every grid point, so for a 600,600,3 input the shape is (12996, 4)
        anchor = _enumerate_shifted_anchor(np.array(self.anchor_base), self.feat_stride, h, w)
        # print('anchor_258:', anchor[0:10])

        rois = list()
        roi_indices = list()
        for i in range(n):
            roi = self.proposal_layer(rpn_locs[i], rpn_fg_scores[i], anchor, img_size, scale=scale)
            batch_index = i * torch.ones((len(roi),))
            rois.append(roi)
            roi_indices.append(batch_index)

        rois = torch.cat(rois, dim=0)
        roi_indices = torch.cat(roi_indices, dim=0)  # concatenate along dim 0

        return rpn_locs, rpn_scores, rois, roi_indices, anchor


def normal_init(m, mean, stddev, truncated=False):
    if truncated:
        m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation of a truncated normal
        # fmod_: elementwise remainder    mul_: elementwise multiply    add_: elementwise add    trailing _: in-place ops
    else:
        m.weight.data.normal_(mean=mean, std=stddev)
        m.bias.data.zero_()


# 3. Decoding the proposals
from torchvision.ops import nms


class ProposalCreator():
    # NMS (non-maximum suppression) removes heavily overlapping anchors; the top 600 (train) / 300 (test) survivors are output as proposals
    def __init__(self, mode, nms_thresh=0.7,
                 n_train_pre_nms=12000,
                 n_train_post_nms=600,
                 n_test_pre_nms=3000,
                 n_test_post_nms=300,
                 min_size=16):
        self.mode = mode
        self.nms_thresh = nms_thresh
        self.n_train_pre_nms = n_train_pre_nms
        self.n_train_post_nms = n_train_post_nms
        self.n_test_pre_nms = n_test_pre_nms
        self.n_test_post_nms = n_test_post_nms
        self.min_size = min_size

    def __call__(self, loc, score, anchor, img_size, scale=1.):
        if self.mode == "training":
            n_pre_nms = self.n_train_pre_nms
            n_post_nms = self.n_train_post_nms
        else:
            n_pre_nms = self.n_test_pre_nms
            n_post_nms = self.n_test_post_nms

        anchor = torch.from_numpy(anchor)
        if loc.is_cuda:
            anchor = anchor.cuda()
        # convert the RPN predictions into proposal boxes
        # [38×38×9, 4]
        roi = loc2bbox(anchor, loc)
        # print('roi_316:', roi.size(), roi[:, :][(roi[:, 2] - roi[:, 0]) != 0].size())

        # keep the proposals inside the image
        roi[:, [0, 2]] = torch.clamp(roi[:, [0, 2]], min=0, max=img_size[1])  # clamp every element to [min, max]
        roi[:, [1, 3]] = torch.clamp(roi[:, [1, 3]], min=0, max=img_size[0])
        # print('roi_323:', roi.size(), roi[:, :][(roi[:, 2] - roi[:, 0]) != 0].size())

        # the minimum proposal width/height is min_size (16) scaled by `scale`
        min_size = self.min_size * scale
        # compute widths and heights
        ws = roi[:, 2] - roi[:, 0]
        hs = roi[:, 3] - roi[:, 1]
        # drop proposals that are too small
        keep = torch.where((ws >= min_size) & (hs >= min_size))[0]  # indices where the condition holds
        # print('keep_332:', keep, 'min_size:', min_size,
        #       'ws:', ws[ws >= min_size].size(),
        #       'hs:', hs[hs >= min_size].size(),
        #       'hs:', max(hs))
        roi = roi[keep, :]
        # print('roi_333:', roi.size())
        # score [38×38×9]
        score = score[keep]

        # sort by score and keep the top proposals
        order = torch.argsort(score, descending=True)  # indices that sort the scores in descending order
        if n_pre_nms > 0:
            order = order[:n_pre_nms]
        roi = roi[order, :]
        score = score[order]

        # non-maximum suppression: avoid piles of proposals in one region by keeping only the highest-scoring box there
        keep = nms(roi, score, self.nms_thresh)  # keep: indices of the boxes that survive NMS (sorted by decreasing score)
        keep = keep[:n_post_nms]
        roi = roi[keep]
        return roi
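
'''
A quick end-to-end check (hypothetical helper): pushing a random [1, 1024, 38, 38]
feature map through the RPN should yield one regression and one score per anchor
(12996 of each) and, in training mode, at most n_train_post_nms = 600 proposals.
'''


def _check_rpn_shapes():
    rpn = RegionProposalNet(in_channels=1024, mode='training')
    feat = torch.randn(1, 1024, 38, 38)
    with torch.no_grad():
        rpn_locs, rpn_scores, rois, roi_indices, anchor = rpn(feat, img_size=(600, 600))
    print(rpn_locs.shape, rpn_scores.shape)  # torch.Size([1, 12996, 4]) torch.Size([1, 12996, 2])
    print(anchor.shape, rois.shape[0] <= 600)  # (12996, 4) True

# _check_rpn_shapes()
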


def loc2bbox(src_bbox, loc):
    # src_bbox: the priors (anchors)    loc: the RPN's predicted offsets
    # from each anchor's top-left (xmin, ymin) and bottom-right (xmax, ymax) corners, its width w and height h,
    # and the offsets (dx, dy, dw, dh), compute the proposal corners dst_bbox (x1, y1, x2, y2)
    if src_bbox.size()[0] == 0:
        return torch.zeros((0, 4), dtype=loc.dtype)

    # prior widths and heights
    src_width = torch.unsqueeze(src_bbox[:, 2] - src_bbox[:, 0], -1)  # unsqueeze inserts a size-1 dim at the given position; a negative dim means dim + input.dim() + 1
    src_height = torch.unsqueeze(src_bbox[:, 3] - src_bbox[:, 1], -1)
    # prior centers
    src_ctr_x = torch.unsqueeze(src_bbox[:, 0], -1) + 0.5 * src_width
    src_ctr_y = torch.unsqueeze(src_bbox[:, 1], -1) + 0.5 * src_height

    dx = loc[:, 0::4]
    dy = loc[:, 1::4]
    dw = loc[:, 2::4]
    dh = loc[:, 3::4]

    # adjusted box centers
    ctr_x = dx * src_width + src_ctr_x
    ctr_y = dy * src_height + src_ctr_y
    # adjusted box widths and heights
    w = torch.exp(dw) * src_width
    h = torch.exp(dh) * src_height

    # [38×38×9, 4]
    dst_bbox = torch.zeros_like(loc)
    # convert back to corner format: top-left and bottom-right
    dst_bbox[:, 0::4] = ctr_x - 0.5 * w
    dst_bbox[:, 1::4] = ctr_y - 0.5 * h
    dst_bbox[:, 2::4] = ctr_x + 0.5 * w
    dst_bbox[:, 3::4] = ctr_y + 0.5 * h

    # the adjusted priors, i.e. the proposals before any filtering
    return dst_bbox
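
'''
A quick check (hypothetical helper): all-zero offsets must leave the anchors
unchanged, since dx = dy = 0 keeps the centers and exp(0) = 1 keeps the sizes.
'''


def _check_loc2bbox_identity():
    anchors = torch.tensor([[0., 0., 16., 16.], [8., 8., 24., 40.]])
    decoded = loc2bbox(anchors, torch.zeros_like(anchors))
    print(torch.allclose(decoded, anchors))  # True

# _check_loc2bbox_identity()
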


# 4. Using the proposals (RoI pooling): gather the feature map and the proposals, crop out each proposal's
#                                       feature map, and feed it to the fully connected head, which classifies
#                                       the proposal and runs a second bounding-box regression for the final, precise detection box
# RoI Pooling: max-pools non-uniformly sized inputs into fixed-size feature maps
from torchvision.ops import RoIPool


class Resnet50RoIHead(nn.Module):
    def __init__(self, n_class, roi_size, spatial_scale, classifier):
        # n_class includes the background
        super(Resnet50RoIHead, self).__init__()
        # the layers used for classification
        self.classifier = classifier
        # regression head applied to the RoI-pooled features
        self.cls_loc = nn.Linear(2048, n_class * 4)
        # classification head applied to the RoI-pooled features
        self.score = nn.Linear(2048, n_class)

        normal_init(self.cls_loc, 0, 0.001)
        normal_init(self.score, 0, 0.01)
        # number of classes, background included
        self.n_class = n_class

        # with a VGG backbone, roi_size would be 7; here it is 14
        self.roi_size = roi_size
        self.spatial_scale = spatial_scale
        self.roi = RoIPool((self.roi_size, self.roi_size), self.spatial_scale)

    def forward(self, x, rois, roi_indices, img_size):
        n, _, _, _ = x.shape
        if x.is_cuda:
            roi_indices = torch.Tensor(roi_indices).cuda().float()
            rois = torch.Tensor(rois).cuda().float()
        else:
            roi_indices = torch.Tensor(roi_indices).float()
            rois = torch.Tensor(rois).float()

        # map the RoIs from image coordinates onto the feature map; since RoIPool
        # was built with spatial_scale=1, the rescaling has to happen here
        rois_feature_map = torch.zeros_like(rois)
        rois_feature_map[:, [0, 2]] = rois[:, [0, 2]] / img_size[1] * x.size()[3]
        rois_feature_map[:, [1, 3]] = rois[:, [1, 3]] / img_size[0] * x.size()[2]

        indices_and_rois = torch.cat([roi_indices[:, None], rois_feature_map], dim=1).contiguous()  # [index, x1, y1, x2, y2]
        # crop the shared feature map with the proposals
        pool = self.roi(x, indices_and_rois)
        # [300, 2048, 1, 1]
        fc7 = self.classifier(pool)
        # [300, 2048]
        fc7 = fc7.view(fc7.size(0), -1)
        roi_cls_locs = self.cls_loc(fc7)
        roi_scores = self.score(fc7)
        roi_cls_locs = roi_cls_locs.view(n, -1, roi_cls_locs.size(1))
        roi_scores = roi_scores.view(n, -1, roi_scores.size(1))
        return roi_cls_locs, roi_scores
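
'''
A quick check (hypothetical helper): RoIPool converts arbitrarily sized proposals
into fixed 14x14 crops, so pooling 2 RoIs from a [1, 1024, 38, 38] feature map
yields [2, 1024, 14, 14], which the classifier head then reduces to 2048-d vectors.
'''


def _check_roi_pool():
    pool = RoIPool((14, 14), spatial_scale=1.0)
    feat = torch.randn(1, 1024, 38, 38)
    rois = torch.tensor([[0., 0., 0., 10., 10.],
                         [0., 5., 5., 30., 20.]])  # [batch_index, x1, y1, x2, y2]
    print(pool(feat, rois).shape)  # torch.Size([2, 1024, 14, 14])

# _check_roi_pool()
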


# 7. Training the RPN
# Encoding: produce the classification and regression ground truth for the RPN, used later when computing the RPN loss
class AnchorTargetCreator(object):
    def __init__(self, n_sample=256, pos_iou_thresh=0.7, neg_iou_thresh=0.3, pos_ratio=0.5):
        self.n_sample = n_sample
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh = neg_iou_thresh
        self.pos_ratio = pos_ratio

    def __call__(self, bbox, anchor, img_size):
        argmax_ious, label = self._create_label(anchor, bbox)

        # encode each prior against its matched ground-truth box
        loc = bbox2loc(anchor, bbox[argmax_ious])
        return loc, label

    def _create_label(self, anchor, bbox):
        # 1 = positive sample, 0 = negative sample, -1 = ignored
        label = np.empty((len(anchor),), dtype=np.int32)  # np.empty returns an uninitialized array of the given shape and dtype
        label.fill(-1)  # fill every entry with -1

        # argmax_ious: for each prior, the index of the ground-truth box with the highest IoU
        # max_ious: for each prior, that highest IoU value
        # gt_argmax_ious: for each ground-truth box, the index of the prior with the highest IoU
        argmax_ious, max_ious, gt_argmax_ious = self._calc_ious(anchor, bbox)

        # below the negative threshold: negative sample
        label[max_ious < self.neg_iou_thresh] = 0

        # every ground-truth box gets at least one prior: the prior with the highest IoU against a ground-truth box is marked positive
        label[gt_argmax_ious] = 1

        # above the positive threshold: positive sample
        label[max_ious >= self.pos_iou_thresh] = 1

        # if there are more than 128 positives, disable the surplus
        n_pos = int(self.pos_ratio * self.n_sample)
        pos_index = np.where(label == 1)[0]     # indices of the positive labels
        if len(pos_index) > n_pos:
            disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
            label[disable_index] = -1

        # balance positives and negatives, keeping the total at 256
        n_neg = self.n_sample - np.sum(label == 1)
        neg_index = np.where(label == 0)[0]     # indices of the negative labels
        if len(neg_index) > n_neg:
            disable_index = np.random.choice(
                neg_index, size=(len(neg_index) - n_neg), replace=False)
            label[disable_index] = -1

        return argmax_ious, label

    def _calc_ious(self, anchor, bbox):
        # IoU between every prior and every ground-truth box
        ious = bbox_iou(anchor, bbox)
        # rows are priors, columns are ground-truth boxes
        argmax_ious = ious.argmax(axis=1)
        # the highest IoU for each prior
        max_ious = ious[np.arange(len(anchor)), argmax_ious]
        # argmax along axis 0: the best prior for each ground-truth box
        gt_argmax_ious = ious.argmax(axis=0)
        # the highest IoU for each ground-truth box
        gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
        # indices of every prior that attains a ground-truth box's highest IoU
        gt_argmax_ious = np.where(ious == gt_max_ious)[0]

        return argmax_ious, max_ious, gt_argmax_ious
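
'''
A toy example (hypothetical helper; it relies on bbox_iou and bbox2loc defined just
below, so call it only after the whole module is loaded): with one ground-truth box,
the prior with the highest IoU is forced positive, priors with IoU < 0.3 become
negatives, and priors in between are ignored (-1).
'''


def _check_anchor_target():
    anchor = np.array([[0., 0., 10., 10.], [0., 0., 20., 20.], [50., 50., 60., 60.]], dtype=np.float32)
    bbox = np.array([[0., 0., 18., 18.]], dtype=np.float32)
    loc, label = AnchorTargetCreator()(bbox, anchor, img_size=(600, 600))
    print(label)  # [-1  1  0]

# _check_anchor_target()
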


def bbox_iou(bbox_a, bbox_b):
    # IoU = (result ∩ GT) / (result ∪ GT)
    if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
        print(bbox_a, bbox_b)
        raise IndexError

    # tl is the elementwise maximum of the top-left corners. To exploit numpy broadcasting,
    # bbox_a[:, None, :2] has shape (N, 1, 2) and bbox_b[:, :2] has shape (K, 2);
    # both broadcast to (N, K, 2), i.e. every box in a is paired with every box in b
    #       Broadcasting rules:
    #               1. NumPy compares the rightmost dimensions first; they match if they are equal or one of them is 1;
    #               2. it then moves left; if every pair matches, the two arrays are compatible;
    #               3. each dimension is then broadcast (repeated) until the shapes agree
    tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])  # np.maximum: elementwise maximum
    br = np.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])

    # np.prod along axis 2 multiplies the two side lengths, reducing [N, K, 2] to [N, K]
    # when tl < br in both coordinates the boxes intersect and the product is the intersection area
    # np.prod(): product of array elements, optionally along a given axis
    area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2)  # all(): True only if every element along the axis is truthy, which zeroes out non-intersecting pairs

    area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
    area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)

    # the IoU matrix has shape (N, K)
    return area_i / (area_a[:, None] + area_b - area_i)  # [:, None] adds a dimension so area_a broadcasts against area_b
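
'''
A quick check (hypothetical helper): two 2x2 boxes offset by one unit overlap in a
1x1 region, so IoU = 1 / (4 + 4 - 1) = 1/7.
'''


def _check_bbox_iou():
    a = np.array([[0., 0., 2., 2.]])
    b = np.array([[1., 1., 3., 3.]])
    print(bbox_iou(a, b))  # [[0.14285714]]

# _check_bbox_iou()
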


# Given proposal boxes and their GT boxes, compute the location offsets loc (dx, dy, dw, dh)
def bbox2loc(src_bbox, dst_bbox):
    # src_bbox: proposal (or anchor) boxes
    # dst_bbox: GT boxes
    width = src_bbox[:, 2] - src_bbox[:, 0]
    height = src_bbox[:, 3] - src_bbox[:, 1]
    ctr_x = src_bbox[:, 0] + 0.5 * width
    ctr_y = src_bbox[:, 1] + 0.5 * height

    base_width = dst_bbox[:, 2] - dst_bbox[:, 0]
    base_height = dst_bbox[:, 3] - dst_bbox[:, 1]
    base_ctr_x = dst_bbox[:, 0] + 0.5 * base_width
    base_ctr_y = dst_bbox[:, 1] + 0.5 * base_height

    eps = np.finfo(height.dtype).eps
    width = np.maximum(width, eps)
    height = np.maximum(height, eps)

    dx = (base_ctr_x - ctr_x) / width
    dy = (base_ctr_y - ctr_y) / height
    dw = np.log(base_width / width)
    dh = np.log(base_height / height)

    # for a 2-D array, transpose() with no arguments is a plain matrix transpose
    loc = np.vstack((dx, dy, dw, dh)).transpose()  # vstack stacks the arrays vertically (row-wise)

    return loc
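
'''
A quick round-trip check (hypothetical helper): bbox2loc and loc2bbox are inverses,
so encoding a ground-truth box against an anchor and decoding the result recovers
the ground-truth box.
'''


def _check_bbox2loc_roundtrip():
    anchor = np.array([[0., 0., 16., 16.]], dtype=np.float32)
    gt = np.array([[2., 4., 20., 28.]], dtype=np.float32)
    loc = bbox2loc(anchor, gt)
    decoded = loc2bbox(torch.from_numpy(anchor), torch.from_numpy(loc))
    print(torch.allclose(decoded, torch.from_numpy(gt)))  # True

# _check_bbox2loc_roundtrip()
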


# 8. Training the RoI head
# Produce the classification and regression ground truth needed later for the Faster R-CNN head losses
class ProposalTargetCreator(object):
    def __init__(self, n_sample=128,
                 pos_ratio=0.5, pos_iou_thresh=0.5,
                 neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
                 ):
        self.n_sample = n_sample
        self.pos_ratio = pos_ratio
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh_hi = neg_iou_thresh_hi
        self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE:default 0.1 in py-faster-rcnn

    def __call__(self, roi, bbox, label,
                 loc_normalize_mean=(0., 0., 0., 0.),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
        n_bbox, _ = bbox.shape

        # sample the positives
        roi = np.concatenate((roi.detach().cpu().numpy(), bbox), axis=0)
        pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
        iou = bbox_iou(roi, bbox)
        gt_assignment = iou.argmax(axis=1)
        max_iou = iou.max(axis=1)
        # ground-truth labels are shifted by +1 because class 0 is the background
        gt_roi_label = label[gt_assignment] + 1

        # indices of RoIs whose IoU clears the positive threshold
        pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
        pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
        if pos_index.size > 0:
            pos_index = np.random.choice(
                pos_index, size=pos_roi_per_this_image, replace=False)

        # balance positives and negatives: RoIs whose max IoU lies in [neg_iou_thresh_lo, neg_iou_thresh_hi) become negatives
        neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                             (max_iou >= self.neg_iou_thresh_lo))[0]
        if neg_index.size > 0:
            try:
                neg_index = np.random.choice(
                    neg_index, size=self.n_sample - pos_roi_per_this_image, replace=False)
            except ValueError:  # not enough negatives to sample without replacement
                neg_index = np.random.choice(
                    neg_index, size=self.n_sample - pos_roi_per_this_image, replace=True)

        # gather the labels of the kept RoIs
        keep_index = np.append(pos_index, neg_index)
        gt_roi_label = gt_roi_label[keep_index]
        gt_roi_label[pos_roi_per_this_image:] = 0
        sample_roi = roi[keep_index]

        # encode the sampled RoIs against their assigned ground-truth boxes, then normalize
        gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
        gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
                       ) / np.array(loc_normalize_std, np.float32))

        return sample_roi, gt_roi_loc, gt_roi_label


# 9. Loss functions
from collections import namedtuple

LossTuple = namedtuple('LossTuple', ['rpn_loc_loss', 'rpn_cls_loss', 'roi_loc_loss', 'roi_cls_loss', 'total_loss'])

'''
1. Extract features from the input image with the backbone
2. Extract rois with the RPN
3. During training, build the proposal targets, i.e. the classification and regression ground truth
   that the Faster R-CNN head losses (6, 7) will need (8, 9)
4. Crop each roi's feature map with roi_pooling
5. Extract features with the classifier
6. faster_rcnn_cls produces the classification result
7. faster_rcnn_reg produces the regression result
8. During training, compute the classification loss
9. During training, compute the regression loss
'''


class FasterRCNNTrainer(nn.Module):
    def __init__(self, faster_rcnn, optimizer):
        super(FasterRCNNTrainer, self).__init__()

        self.faster_rcnn = faster_rcnn
        self.rpn_sigma = 1
        self.roi_sigma = 1

        # target creators that encode the anchors and proposals for training
        self.anchor_target_creator = AnchorTargetCreator()
        self.proposal_target_creator = ProposalTargetCreator()

        self.loc_normalize_mean = [0, 0, 0, 0]
        self.loc_normalize_std = [0.1, 0.1, 0.2, 0.2]

        self.optimizer = optimizer

    def forward(self, imgs, bboxes, labels, scale):
        n = imgs.shape[0]
        img_size = imgs.shape[2:]

        # 1. get the shared feature map
        base_feature = self.faster_rcnn.extractor(imgs)

        # 2. run the RPN to get the prior scores and adjustment parameters
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(base_feature, img_size, scale)

        # initialize all losses to 0
        rpn_loc_loss_all, rpn_cls_loss_all, roi_loc_loss_all, roi_cls_loss_all = 0, 0, 0, 0
        for i in range(n):
            bbox = bboxes[i]
            label = labels[i]
            rpn_loc = rpn_locs[i]
            rpn_score = rpn_scores[i]
            roi = rois[roi_indices == i]
            feature = base_feature[i]

            # 3. use the ground-truth and prior boxes to build the targets the RPN should predict
            # every prior gets a label
            # gt_rpn_loc      [num_anchors, 4]
            # gt_rpn_label    [num_anchors, ]
            gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(bbox, anchor, img_size)
            gt_rpn_loc = torch.Tensor(gt_rpn_loc)
            gt_rpn_label = torch.Tensor(gt_rpn_label).long()

            if rpn_loc.is_cuda:
                gt_rpn_loc = gt_rpn_loc.cuda()
                gt_rpn_label = gt_rpn_label.cuda()

            # 4. compute the RPN regression and classification losses
            rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc, gt_rpn_label, self.rpn_sigma)
            rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label, ignore_index=-1)

            # 5. use the ground-truth boxes and proposals to build the targets the classifier should predict
            # three outputs: sample_roi, gt_roi_loc, gt_roi_label
            # sample_roi      [n_sample, ]
            # gt_roi_loc      [n_sample, 4]
            # gt_roi_label    [n_sample, ]
            sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(roi, bbox, label,
                                                                                self.loc_normalize_mean,
                                                                                self.loc_normalize_std)

            sample_roi = torch.Tensor(sample_roi)
            gt_roi_loc = torch.Tensor(gt_roi_loc)
            gt_roi_label = torch.Tensor(gt_roi_label).long()
            sample_roi_index = torch.zeros(len(sample_roi))

            if feature.is_cuda:
                sample_roi = sample_roi.cuda()
                sample_roi_index = sample_roi_index.cuda()
                gt_roi_loc = gt_roi_loc.cuda()
                gt_roi_label = gt_roi_label.cuda()

            # 6. run the classifier head
            roi_cls_loc, roi_score = self.faster_rcnn.head(torch.unsqueeze(feature, 0), sample_roi, sample_roi_index,
                                                           img_size)

            # pick the regression output that corresponds to each proposal's class
            n_sample = roi_cls_loc.size()[1]
            roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
            roi_loc = roi_cls_loc[torch.arange(0, n_sample), gt_roi_label]

            # 7. compute the classifier's regression and classification losses
            roi_loc_loss = _fast_rcnn_loc_loss(roi_loc, gt_roi_loc, gt_roi_label.data, self.roi_sigma)
            roi_cls_loss = nn.CrossEntropyLoss()(roi_score[0], gt_roi_label)

            rpn_loc_loss_all += rpn_loc_loss
            rpn_cls_loss_all += rpn_cls_loss
            roi_loc_loss_all += roi_loc_loss
            roi_cls_loss_all += roi_cls_loss

        losses = [rpn_loc_loss_all / n, rpn_cls_loss_all / n, roi_loc_loss_all / n, roi_cls_loss_all / n]
        losses = losses + [sum(losses)]
        return LossTuple(*losses)

    def train_step(self, imgs, bboxes, labels, scale):
        self.optimizer.zero_grad()
        losses = self.forward(imgs, bboxes, labels, scale)
        losses.total_loss.backward()
        self.optimizer.step()
        return losses


def _smooth_l1_loss(x, t, sigma):
    sigma_squared = sigma ** 2
    regression_diff = (x - t)
    regression_diff = regression_diff.abs()
    regression_loss = torch.where(
        regression_diff < (1. / sigma_squared),
        0.5 * sigma_squared * regression_diff ** 2,
        regression_diff - 0.5 / sigma_squared
    )
    return regression_loss.sum()


def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    pred_loc = pred_loc[gt_label > 0]
    gt_loc = gt_loc[gt_label > 0]

    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, sigma)
    num_pos = (gt_label > 0).sum().float()
    loc_loss /= torch.max(num_pos, torch.ones_like(num_pos))
    return loc_loss
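
'''
A quick check (hypothetical helper): with sigma = 1 the smooth L1 loss is
0.5 * d^2 for |d| < 1 and |d| - 0.5 otherwise, so the residuals 0.5 and 2.0
contribute 0.125 and 1.5, summing to 1.625.
'''


def _check_smooth_l1():
    x = torch.tensor([0.5, 2.0])
    t = torch.zeros(2)
    print(_smooth_l1_loss(x, t, sigma=1))  # tensor(1.6250)

# _check_smooth_l1()
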


# 10. Training Faster R-CNN
from tqdm import tqdm
from torch.utils.data.dataset import Dataset
from PIL import Image
import cv2
from torch.utils.data import DataLoader


def fit_one_epoch(net, epoch, epoch_size, epoch_size_val, gen, genval, Epoch, cuda):
    total_loss = 0
    rpn_loc_loss = 0
    rpn_cls_loss = 0
    roi_loc_loss = 0
    roi_cls_loss = 0
    val_total_loss = 0

    with tqdm(total=epoch_size, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:

        for iteration, batch in enumerate(gen):
            if iteration >= epoch_size:
                break
            imgs, boxes, labels = batch[0], batch[1], batch[2]
            with torch.no_grad():
                if cuda:
                    imgs = Variable(torch.from_numpy(imgs).type(torch.FloatTensor)).cuda()
                else:
                    imgs = Variable(torch.from_numpy(imgs).type(torch.FloatTensor))
            losses = train_util.train_step(imgs, boxes, labels, 1)
            rpn_loc, rpn_cls, roi_loc, roi_cls, total = losses
            total_loss += total.item()
            rpn_loc_loss += rpn_loc.item()
            rpn_cls_loss += rpn_cls.item()
            roi_loc_loss += roi_loc.item()
            roi_cls_loss += roi_cls.item()

            pbar.set_postfix(**{'total': total_loss / (iteration + 1),
                                'rpn_loc': rpn_loc_loss / (iteration + 1),
                                'rpn_cls': rpn_cls_loss / (iteration + 1),
                                'roi_loc': roi_loc_loss / (iteration + 1),
                                'roi_cls': roi_cls_loss / (iteration + 1),
                                'lr': get_lr(optimizer)})
            pbar.update(1)

    print('Start Validation')
    with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(genval):
            if iteration >= epoch_size_val:
                break
            imgs, boxes, labels = batch[0], batch[1], batch[2]
            with torch.no_grad():
                if cuda:
                    imgs = Variable(torch.from_numpy(imgs).type(torch.FloatTensor)).cuda()
                else:
                    imgs = Variable(torch.from_numpy(imgs).type(torch.FloatTensor))

                train_util.optimizer.zero_grad()
                losses = train_util.forward(imgs, boxes, labels, 1)
                _, _, _, _, val_total = losses

                val_total_loss += val_total.item()
            pbar.set_postfix(**{'total_loss': val_total_loss / (iteration + 1)})
            pbar.update(1)
    print('Finish Validation')
    print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
    print('Total Loss: {} || Val Loss: {}'.format(total_loss / (epoch_size + 1),
                                                  val_total_loss / (epoch_size_val + 1)))
    print('Saving state, iter:', str(epoch + 1))
    torch.save(net.state_dict(),
               'vocdataset/Epoch{}-Total_Loss{}-Val_Loss{}.pth'.format(epoch + 1, total_loss / (epoch_size + 1),
                                                                       val_total_loss / (epoch_size_val + 1)))


class FasterRCNN(nn.Module):
    def __init__(self, num_classes, mode="training", feat_stride=16, anchor_scales=[8, 16, 32], ratios=[0.5, 1, 2],
                 backbone='resnet50'):
        super(FasterRCNN, self).__init__()
        self.feat_stride = feat_stride
        if backbone == "resnet50":
            self.extractor, classifier = resnet50()
            self.rpn = RegionProposalNet(
                in_channels=1024,
                mid_channels=512,
                ratios=ratios,
                anchor_scales=anchor_scales,
                feat_stride=feat_stride,
                mode=mode
            )
            self.head = Resnet50RoIHead(
                n_class=num_classes + 1,
                roi_size=14,
                spatial_scale=1,
                classifier=classifier
            )

    def forward(self, x, scale=1.):
        img_size = x.shape[2:]
        base_feature = self.extractor(x)
        _, _, rois, roi_indices, _ = self.rpn(base_feature, img_size, scale)
        roi_cls_locs, roi_scores = self.head(base_feature, rois, roi_indices, img_size)
        return roi_cls_locs, roi_scores, rois, roi_indices

    def freeze_bn(self):
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                # eval() disables BatchNorm (and Dropout) updates: PyTorch then uses the trained
                # running statistics instead of batch statistics, so the values stay fixed;
                # otherwise a small test batch_size could easily skew the results through the BN layers


class FRCNNDataset(Dataset):
    def __init__(self, train_lines, shape=[600, 600], is_train=True):
        self.train_lines = train_lines
        self.train_batches = len(train_lines)
        self.shape = shape
        self.is_train = is_train

    def __len__(self):
        return self.train_batches

    def rand(self, a=0, b=1):
        return np.random.rand() * (b - a) + a

    def get_random_data(self, annotation_line, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):

        line = annotation_line.split()
        image = Image.open(line[0])
        iw, ih = image.size
        h, w = self.shape
        box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

        if not random:
            # resize image
            scale = min(w / iw, h / ih)
            nw = int(iw * scale)
            nh = int(ih * scale)
            dx = (w - nw) // 2
            dy = (h - nh) // 2

            image = image.resize((nw, nh), Image.BICUBIC)
            new_image = Image.new('RGB', (w, h), (128, 128, 128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.float32)

            # correct boxes
            box_data = np.zeros((len(box), 5))
            if len(box) > 0:
                np.random.shuffle(box)
                box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
                box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
                box[:, 0:2][box[:, 0:2] < 0] = 0
                box[:, 2][box[:, 2] > w] = w
                box[:, 3][box[:, 3] > h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w > 1, box_h > 1)]
                box_data = np.zeros((len(box), 5))
                box_data[:len(box)] = box

            return image_data, box_data

        # resize image
        new_ar = w / h * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter)
        scale = self.rand(.5, 1.5)
        if new_ar < 1:
            nh = int(scale * h)
            nw = int(nh * new_ar)
        else:
            nw = int(scale * w)
            nh = int(nw / new_ar)
        image = image.resize((nw, nh), Image.BICUBIC)

        # place image
        dx = int(self.rand(0, w - nw))
        dy = int(self.rand(0, h - nh))
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image = new_image

        # flip image or not
        flip = self.rand() < .5
        if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)

        # distort image
        hue = self.rand(-hue, hue)
        sat = self.rand(1, sat) if self.rand() < .5 else 1 / self.rand(1, sat)
        val = self.rand(1, val) if self.rand() < .5 else 1 / self.rand(1, val)
        # OpenCV float32 HSV uses H in [0, 360] and S, V in [0, 1]
        x = cv2.cvtColor(np.array(image, np.float32) / 255, cv2.COLOR_RGB2HSV)
        x[..., 0] += hue * 360
        x[..., 0][x[..., 0] > 360] -= 360  # wrap the hue channel around at 360
        x[..., 0][x[..., 0] < 0] += 360
        x[..., 1] *= sat
        x[..., 2] *= val
        x[x[:, :, 0] > 360, 0] = 360
        x[:, :, 1:][x[:, :, 1:] > 1] = 1
        x[x < 0] = 0
        image_data = cv2.cvtColor(x, cv2.COLOR_HSV2RGB) * 255  # numpy array, 0 to 255

        # correct boxes
        box_data = np.zeros((len(box), 5))
        if len(box) > 0:
            np.random.shuffle(box)
            box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
            box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
            if flip: box[:, [0, 2]] = w - box[:, [2, 0]]
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard invalid box
            box_data = np.zeros((len(box), 5))
            box_data[:len(box)] = box

        return image_data, box_data

    def __getitem__(self, index):
        img, y = self.get_random_data(self.train_lines[index], random=self.is_train)
        img = np.transpose(img / 255.0, [2, 0, 1])
        box = y[:, :4]
        label = y[:, -1]
        return img, box, label


# used as the collate_fn of the DataLoader
def frcnn_dataset_collate(batch):
    images = []
    bboxes = []
    labels = []
    for img, box, label in batch:
        images.append(img)
        bboxes.append(box)
        labels.append(label)
    images = np.array(images)
    return images, bboxes, labels


def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']


# 11. Dataset split
import os
import random
import xml.etree.ElementTree as ET

random.seed(1)


def data_ide(xmlfilepath, saveBasePath, trainval_percent=1, train_percent=1, num=None):
    temp_xml = os.listdir(xmlfilepath)  # list the files and directories under the given path
    total_xml = []
    for xml in temp_xml:
        if xml.endswith(".xml"):
            total_xml.append(xml)

    if num is None:
        num = len(total_xml)
    indices = range(num)
    tv = int(num * trainval_percent)
    tr = int(tv * train_percent)
    trainval = random.sample(indices, tv)
    train = random.sample(trainval, tr)

    ftrainval = open(os.path.join(saveBasePath, 'trainval.txt'), 'w')
    ftest = open(os.path.join(saveBasePath, 'test.txt'), 'w')
    ftrain = open(os.path.join(saveBasePath, 'train.txt'), 'w')
    fval = open(os.path.join(saveBasePath, 'val.txt'), 'w')
    for i in indices:
        if i < len(total_xml):
            name = total_xml[i][:-4] + '\n'
            if i in trainval:
                ftrainval.write(name)
                if i in train:
                    ftrain.write(name)
                else:
                    fval.write(name)
            else:
                ftest.write(name)
        else:
            print("error: list out of range, please decrease 'num'")

    ftrainval.close()
    ftrain.close()
    fval.close()
    ftest.close()


def convert_annotation(xmlfilepath, image_id, list_file, classes):
    in_file = open(os.path.join(xmlfilepath, str(image_id) + '.xml'), encoding='utf-8')
    tree = ET.parse(in_file)
    root = tree.getroot()
    # print(image_id)
    for obj in root.iter('object'):
        difficult = 0
        if obj.find('difficult') is not None:
            difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
            # classes.append(cls)
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        b = (int(float(xmlbox.find('xmin').text)),
             int(float(xmlbox.find('ymin').text)),
             int(float(xmlbox.find('xmax').text)),
             int(float(xmlbox.find('ymax').text)))
        list_file.write(" " + ",".join([str(a) for a in b]) + "," + str(cls_id))
        # print(classes)

def voc_annotation(sets, classes, imagePath, BasePath, xmlfilepath):
    # wd = os.getcwd()  # current working directory
    for year, image_set in sets:
        image_ids = open(os.path.join(BasePath, str(image_set) + '.txt'), encoding='utf-8').read().strip().split()
        list_file = open(os.path.join(BasePath, '{}_{}.txt'.format(year, image_set)), 'w', encoding='utf-8')
        for image_id in image_ids:
            list_file.write(os.path.join(imagePath, str(image_id) + '.jpg'))
            convert_annotation(xmlfilepath, image_id, list_file, classes)
            list_file.write('\n')
        list_file.close()
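
'''
For reference, each line written by voc_annotation looks like the (made-up) example
below: the image path followed by one "x1,y1,x2,y2,class_id" group per object, which
is exactly the format FRCNNDataset.get_random_data parses with split() and split(','):

    /VOC2007/JPEGImages/000005.jpg 263,211,324,339,8 165,264,253,372,8
'''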


if __name__ == '__main__':
    xmlfilepath = '/VOC2007/Annotations/'
    imagePath = '/VOC2007/JPEGImages/'
    saveBasePath = 'vocdataset/'
    trainval_percent = 0.9
    train_percent = 0.8
    num_data = None
    sets = [('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
    classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
    # data_ide(xmlfilepath, saveBasePath, trainval_percent, train_percent, num_data)
    # voc_annotation(sets, classes, imagePath, saveBasePath, xmlfilepath)

    import torch.backends.cudnn as cudnn
    import torch.optim as optim

    Cuda = False
    NUM_CLASSES = len(classes)
    input_shape = [800, 800, 3]
    backbone = "resnet50"
    annotation_path_train = os.path.join(saveBasePath, '2007_train.txt')
    annotation_path_val = os.path.join(saveBasePath, '2007_val.txt')
    annotation_path_test = os.path.join(saveBasePath, '2007_test.txt')
    
    # load the pretrained weights
    model_path = os.path.join(saveBasePath, 'voc_weights_resnet.pth')

    model = FasterRCNN(NUM_CLASSES, backbone=backbone)
    print('Loading weights into state dict...')
    device = torch.device('cuda' if (torch.cuda.is_available() and Cuda) else 'cpu')
    model_dict = model.state_dict()
    pretrained_dict = torch.load(model_path, map_location=device)
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)}
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    print('Finished!')

    net = model.train()
    if Cuda:
        net = torch.nn.DataParallel(model)
        cudnn.benchmark = True
        net = net.cuda()

    with open(annotation_path_train) as f, open(annotation_path_val) as f_v, open(annotation_path_test) as f_t:
        lines_train = f.readlines()
        lines_val = f_v.readlines()
        lines_test = f_t.readlines()
    np.random.seed(1)
    np.random.shuffle(lines_train)
    np.random.shuffle(lines_val)
    np.random.shuffle(lines_test)
    np.random.seed(None)
    
    num_train = len(lines_train)
    num_val = len(lines_val)
    train_file = lines_train[:]
    val_file = lines_val[:]
    test_file = lines_test[:]

    # The backbone features are generic, so freezing the backbone speeds up training
    # and protects the pretrained weights early on.
    # Init_Epoch is the starting epoch
    # Freeze_Epoch is the last epoch of frozen training
    # Epoch / Unfreeze_Epoch is the total number of training epochs
    if True:
        lr = 1e-4
        Batch_size = 2
        Init_Epoch = 0
        Freeze_Epoch = 50

        optimizer = optim.Adam(net.parameters(), lr, weight_decay=5e-4)
        lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

        train_dataset = FRCNNDataset(train_file, (input_shape[0], input_shape[1]), is_train=True)
        val_dataset = FRCNNDataset(val_file, (input_shape[0], input_shape[1]), is_train=False)
        gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                         drop_last=True, collate_fn=frcnn_dataset_collate)
        gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                             drop_last=True, collate_fn=frcnn_dataset_collate)

        epoch_size = num_train // Batch_size
        epoch_size_val = num_val // Batch_size

        # freeze part of the network for the first training stage
        for param in model.extractor.parameters():
            param.requires_grad = False

        # freeze the BN layers
        model.freeze_bn()

        train_util = FasterRCNNTrainer(model, optimizer)
        # print('Starting to train with freeze BN...')
        for epoch in range(Init_Epoch, Freeze_Epoch):
            fit_one_epoch(net, epoch, epoch_size, epoch_size_val, gen, gen_val, Freeze_Epoch, Cuda)
            lr_scheduler.step()

    if True:
        lr = 1e-5
        # Batch_size stays the same as in the frozen stage
        Freeze_Epoch = 50
        Unfreeze_Epoch = 100

        optimizer = optim.Adam(net.parameters(), lr, weight_decay=5e-4)
        lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

        train_dataset = FRCNNDataset(train_file, (input_shape[0], input_shape[1]), is_train=True)
        val_dataset = FRCNNDataset(val_file, (input_shape[0], input_shape[1]), is_train=False)
        gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                         drop_last=True, collate_fn=frcnn_dataset_collate)
        gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                             drop_last=True, collate_fn=frcnn_dataset_collate)

        epoch_size = num_train // Batch_size
        epoch_size_val = num_val // Batch_size

        # unfreeze the backbone for the second training stage
        for param in model.extractor.parameters():
            param.requires_grad = True

        # keep the BN layers frozen
        model.freeze_bn()

        for epoch in range(Freeze_Epoch, Unfreeze_Epoch):
            fit_one_epoch(net, epoch, epoch_size, epoch_size_val, gen, gen_val, Unfreeze_Epoch, Cuda)
            lr_scheduler.step()