# 训练部分
# 数据集:VOC2007
# 预训练主干网络,自行下载即可
import pdb # Debug工具
import math
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from torch.autograd import Variable
torch.manual_seed(1)
'''
输入图片->主干网络获得共享特征层->共享特征层获得建议框->建议框解码获得截取的位置->特征层截取->resize和conv->获得最终预测结果并解码
Conv Block 输入和输出的维度是不一样的,不能连续串联,用于改变网络的维度
Identity Block 输入维度和输出维度相同,可以串联,用于加深网络的
'''
# 1. 主干网络 Conv layers: feature maps被共享用于后续RPN层(RegionProposalNet) 和全连接层(Resnet50RoIHead)
class Bottleneck(nn.Module):
    """ResNet bottleneck residual block (1x1 -> 3x3 -> 1x1 convolutions).

    The block outputs ``planes * 4`` channels. When ``downsample`` is given it
    is applied to the input to build the shortcut branch; otherwise the input
    itself is added to the main branch, so it must already have the output
    shape.

    Args:
        inplanes: number of input channels.
        planes: width of the two inner convolutions.
        stride: stride of the first 1x1 convolution (spatial downsampling
            happens there, not in the 3x3 convolution).
        downsample: optional module mapping the input to the output shape.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * 4

        # Registration order is kept stable so state_dict keys and the order
        # of random initialisation do not change.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)

        # inplace=True overwrites the activation tensor instead of allocating
        # a new one.
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        shortcut = x if self.downsample is None else self.downsample(x)

        y = self.relu(self.bn1(self.conv1(x)))
        y = self.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))

        y += shortcut
        return self.relu(y)
class Resnet(nn.Module):
    """ResNet classifier assembled from four stages of residual blocks.

    Args:
        block: residual block class; must expose an ``expansion`` attribute
            and accept ``(inplanes, planes, stride, downsample)``.
        layers: number of blocks in each of the four stages.
        num_classes: size of the final fully connected layer.

    The original author's notes give these shapes for a 600x600x3 input:
    300x300x64 after ``conv1``, 150x150x64 after ``maxpool``, 150x150x256
    after ``layer1``, 75x75x512 after ``layer2`` and 38x38x1024 after
    ``layer3``.
    """

    def __init__(self, block, layers, num_classes=1000):
        super().__init__()
        # Channel count fed to the next stage; updated by _make_layer.
        self.inplanes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        # ceil_mode=True rounds the pooled size up.
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        # layer4 is reused by the detection head as its classifier trunk.
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AvgPool2d(7)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # He-style init for convolutions, identity init for batch norm.
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Conv2d):
                fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan_out))

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block may change the stride or channel count; it gets
        a 1x1 conv + batch norm shortcut whenever the input and output shapes
        differ.
        """
        out_planes = planes * block.expansion

        shortcut = None
        if not (stride == 1 and self.inplanes == out_planes):
            shortcut = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

        stage = [block(self.inplanes, planes, stride, shortcut)]
        self.inplanes = out_planes
        for _ in range(1, blocks):
            stage.append(block(self.inplanes, planes))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the full classification network and return the logits."""
        pipeline = (
            self.conv1, self.bn1, self.relu, self.maxpool,
            self.layer1, self.layer2, self.layer3, self.layer4,
            self.avgpool,
        )
        for step in pipeline:
            x = step(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)
def resnet50():
    """Build a ResNet-50 and split it into the two parts Faster R-CNN needs.

    Returns:
        (features, classifier): ``features`` runs ``conv1`` through
        ``layer3`` and produces the shared feature map (38x38x1024 for a
        600x600 input, per the original author's notes); ``classifier`` is
        ``layer4`` followed by ``avgpool`` and serves as the second-stage
        trunk. The ``fc`` layer is not part of either piece.
    """
    backbone = Resnet(Bottleneck, [3, 4, 6, 3])

    features = nn.Sequential(
        backbone.conv1,
        backbone.bn1,
        backbone.relu,
        backbone.maxpool,
        backbone.layer1,
        backbone.layer2,
        backbone.layer3,
    )
    classifier = nn.Sequential(backbone.layer4, backbone.avgpool)
    return features, classifier
# 2.1 Proposal建议框, 生成 anchor
def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32]):
    """Return the base anchors centred on the origin.

    One anchor is produced per (ratio, scale) pair, ordered ratio-major, so
    the row index is ``ratio_idx * len(anchor_scales) + scale_idx``.

    Args:
        base_size: side length multiplied by each scale.
        ratios: aspect ratios.
        anchor_scales: scale multipliers.

    Returns:
        float32 array of shape ``(len(ratios) * len(anchor_scales), 4)``;
        each row is ``(-a/2, -b/2, a/2, b/2)`` with
        ``a = base_size * scale * sqrt(ratio)`` and
        ``b = base_size * scale * sqrt(1 / ratio)``.
    """
    rows = []
    for ratio in ratios:
        for scale in anchor_scales:
            extent_a = base_size * scale * np.sqrt(ratio)
            extent_b = base_size * scale * np.sqrt(1. / ratio)
            rows.append([-extent_a / 2., -extent_b / 2., extent_a / 2., extent_b / 2.])
    # reshape keeps the (0, 4) shape when no ratio/scale pair is given.
    return np.array(rows, dtype=np.float32).reshape(-1, 4)
def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
# 1. 在原图上生成anchor 的过程, 假设height = 4, width = 4,则K = 4*4 = 16
# 计算网格中心点
# width = 4, shift_x = [0, 16, 32, 48]
shift_x = np.arange(0, width * feat_stride, feat_stride)
# height = 4, shift_y = [0, 16, 32, 48]
shift_y = np.arange(0, height * feat_stride, feat_stride)
# 生成网格 shift_x = [ [0, 16, 32, 48], [0, 16, 32, 48], [0, 16, 32, 48], [0, 16, 32, 48] ]
# 生成网格 shift_y = [ [0, 0, 0, 0], [16, 16, 16, 16], [32, 32, 32, 32], [48, 48, 48, 48] ]
shift_x, shift_y = np.meshgrid(shift_x, shift_y) # 生成网格点坐标矩阵
# ravel() 将数组维度拉成一维数组, stack 沿着新轴连接数组的序列
# shift.T = [ [0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48],
# [0, 0, 0, 0, 16, 16, 16, 16, 32, 32, 32, 32, 48, 48, 48, 48],
# [0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48, 0, 16, 32, 48],
# [0, 0, 0, 0, 16, 16, 16, 16, 32, 32, 32, 32, 48, 48, 48, 48] ]
# shift = np.stack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel(),), axis=1)
shift = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel(),)).transpose()
# 2. 生成base anchor
# 每个网格点上的9个先验框 shape: [9, 4]
A = anchor_base.shape[0]
# K表示 feature map 有多少个像素点 4*4 = 16
K = shift.shape[0]
# 3. 把9 个anchors 分别移动到shifts 所给出的K(16) 个位置上
# 将anchor 左上点的坐标和shifts 的前两列加起来,右下点的坐标和shifts 的后两列加起来
# 因为shifts 的前两列和后两列相同,意味着anchor 的左上点和右下点都平移了相同的距离
anchor = anchor_base.reshape((1, A, 4)) + shift.reshape((K, 1, 4))
# 所有的先验框, 每幅图共生成 K * A 个anchor,用4 个坐标表示
anchor = anchor.reshape((K * A, 4)).astype(np.float32)
return anchor
# 2.2 Proposal建议框, Region Proposal Networks: 用于生成region proposals。该层通过softmax判断anchors属于positive或者negative,
# 再利用bounding box regression修正anchors获得精确的proposals(rois)
'''
1. 对于输入的feature map先用rpn_conv进行卷积
2. 然后使用rpn_cls卷积层得到分类结果
3. 同时使用rpn_reg卷积层得到回归结果
4. 然后之后再调用proposal函数得到proposals(rois)
5. 如果是训练过程,那么使用调用anchor_target产生rpn网络中分类和回归的ground truth值,之后在计算rpn的loss时会用到
6. 如果是训练过程,那么计算分类loss
7. 如果是训练过程,那么计算回归loss
'''
class RegionProposalNet(nn.Module):
    """Region Proposal Network: scores and refines anchors into proposals.

    A 3x3 convolution mixes the shared feature map, then two 1x1
    convolutions predict, for every anchor at every cell, two
    objectness scores and four box-regression offsets.

    Args:
        in_channels: channels of the incoming feature map.
        mid_channels: channels of the intermediate 3x3 convolution.
        ratios, anchor_scales: forwarded to ``generate_anchor_base``.
        feat_stride: spacing used when tiling anchors over the feature grid.
        mode: forwarded to ``ProposalCreator``.
    """

    def __init__(self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2], anchor_scales=[8, 16, 32],
                 feat_stride=16, mode='training'):
        super().__init__()
        self.feat_stride = feat_stride
        self.proposal_layer = ProposalCreator(mode)

        # Base anchors, one row per (ratio, scale) pair.
        self.anchor_base = generate_anchor_base(anchor_scales=anchor_scales, ratios=ratios)
        anchors_per_cell = self.anchor_base.shape[0]

        self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
        # Two scores per anchor.
        self.score = nn.Conv2d(mid_channels, anchors_per_cell * 2, 1, 1, 0)
        # Four regression offsets per anchor.
        self.loc = nn.Conv2d(mid_channels, anchors_per_cell * 4, 1, 1, 0)

        for layer in (self.conv1, self.score, self.loc):
            normal_init(layer, 0, 0.01)

    def forward(self, x, img_size, scale=1.):
        """Predict anchor offsets and scores and decode them into proposals.

        Args:
            x: feature map of shape ``(n, c, h, w)``.
            img_size: image size, forwarded to the proposal layer.
            scale: forwarded to the proposal layer.

        Returns:
            ``(rpn_locs, rpn_scores, rois, roi_indices, anchor)`` where
            ``rpn_locs`` is ``(n, h*w*A, 4)``, ``rpn_scores`` is
            ``(n, h*w*A, 2)``, ``rois`` concatenates the proposals of all
            images, ``roi_indices`` holds the batch index of each proposal
            as a float tensor created on the CPU, and ``anchor`` is the
            numpy array of tiled anchors.
        """
        n, _, h, w = x.shape

        hidden = F.relu(self.conv1(x))

        # Move channels last so each row of the flattened view is one anchor.
        rpn_locs = self.loc(hidden).permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
        rpn_scores = self.score(hidden).permute(0, 2, 3, 1).contiguous().view(n, -1, 2)

        # Index 1 of the softmax is taken as the foreground probability.
        foreground = F.softmax(rpn_scores, dim=-1)[:, :, 1].contiguous().view(n, -1)

        anchor = _enumerate_shifted_anchor(np.array(self.anchor_base), self.feat_stride, h, w)

        per_image_rois = []
        per_image_indices = []
        for i in range(n):
            proposals = self.proposal_layer(rpn_locs[i], foreground[i], anchor, img_size, scale=scale)
            per_image_rois.append(proposals)
            per_image_indices.append(torch.full((len(proposals),), float(i)))

        rois = torch.cat(per_image_rois, dim=0)
        roi_indices = torch.cat(per_image_indices, dim=0)
        return rpn_locs, rpn_scores, rois, roi_indices, anchor
def normal_init(m, mean, stddev, truncated=False):
    """Initialise the weights of layer ``m`` from a normal distribution.

    Args:
        m: module with ``weight`` (and, when not truncated, ``bias``).
        mean: mean of the distribution.
        stddev: standard deviation of the distribution.
        truncated: when true, draw a standard normal, fold it into (-2, 2)
            with ``fmod`` and then scale and shift it. This only
            approximates a truncated normal, and the bias is left untouched.
            Otherwise draw weights from ``N(mean, stddev)`` and zero the
            bias.
    """
    if not truncated:
        m.weight.data.normal_(mean=mean, std=stddev)
        m.bias.data.zero_()
        return
    # All operations are in-place on the weight tensor.
    m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)
# 3 Proposal建议框的解码
from torchvision.ops import nms
class ProposalCreator():
    """Decode RPN outputs into a bounded list of proposals.

    Proposals are clipped to the image, filtered by minimum size, ranked by
    score, truncated, de-duplicated with non-maximum suppression and
    truncated again.

    Args:
        mode: ``"training"`` selects the ``n_train_*`` limits; anything else
            selects the ``n_test_*`` limits.
        nms_thresh: IoU threshold passed to ``nms``.
        n_train_pre_nms, n_train_post_nms: limits before/after NMS in
            training mode.
        n_test_pre_nms, n_test_post_nms: limits before/after NMS otherwise.
        min_size: minimum proposal width and height, multiplied by ``scale``
            at call time.
    """

    def __init__(self, mode, nms_thresh=0.7,
                 n_train_pre_nms=12000,
                 n_train_post_nms=600,
                 n_test_pre_nms=3000,
                 n_test_post_nms=300,
                 min_size=16):
        self.mode = mode
        self.nms_thresh = nms_thresh
        self.n_train_pre_nms = n_train_pre_nms
        self.n_train_post_nms = n_train_post_nms
        self.n_test_pre_nms = n_test_pre_nms
        self.n_test_post_nms = n_test_post_nms
        self.min_size = min_size

    def __call__(self, loc, score, anchor, img_size, scale=1.):
        """Return the proposals for one image.

        Args:
            loc: predicted offsets, one row per anchor.
            score: foreground score per anchor.
            anchor: numpy array of anchors matching ``loc`` row for row.
            img_size: ``(height, width)``; index 1 bounds columns 0 and 2,
                index 0 bounds columns 1 and 3.
            scale: multiplier applied to ``min_size``.

        Returns:
            Tensor of at most ``n_post_nms`` proposals, in the order
            returned by ``nms``.
        """
        in_training = self.mode == "training"
        n_pre_nms = self.n_train_pre_nms if in_training else self.n_test_pre_nms
        n_post_nms = self.n_train_post_nms if in_training else self.n_test_post_nms

        anchor = torch.from_numpy(anchor)
        if loc.is_cuda:
            anchor = anchor.cuda()

        # Apply the predicted offsets to the anchors.
        roi = loc2bbox(anchor, loc)

        # Clip the boxes to the image.
        roi[:, [0, 2]] = torch.clamp(roi[:, [0, 2]], min=0, max=img_size[1])
        roi[:, [1, 3]] = torch.clamp(roi[:, [1, 3]], min=0, max=img_size[0])

        # Drop boxes narrower or shorter than the scaled minimum size.
        smallest_side = self.min_size * scale
        widths = roi[:, 2] - roi[:, 0]
        heights = roi[:, 3] - roi[:, 1]
        large_enough = torch.where((widths >= smallest_side) & (heights >= smallest_side))[0]
        roi = roi[large_enough, :]
        score = score[large_enough]

        # Keep the highest-scoring candidates before NMS.
        ranking = torch.argsort(score, descending=True)
        if n_pre_nms > 0:
            ranking = ranking[:n_pre_nms]
        roi = roi[ranking, :]
        score = score[ranking]

        # Non-maximum suppression, then keep the first n_post_nms survivors.
        survivors = nms(roi, score, self.nms_thresh)[:n_post_nms]
        return roi[survivors]
def loc2bbox(src_bbox, loc):
    """Decode regression offsets into corner-format boxes.

    Args:
        src_bbox: source boxes of shape ``(N, 4)`` with corners in columns
            ``(0, 1)`` and ``(2, 3)``.
        loc: offsets ``(dx, dy, dw, dh)`` of shape ``(N, 4)``. Columns are
            read with a stride of 4, so ``(N, 4 * k)`` is also accepted.

    Returns:
        Tensor shaped like ``loc`` holding the decoded corners. For empty
        ``src_bbox`` an empty ``(0, 4)`` tensor with ``loc``'s dtype and
        device is returned.
    """
    if src_bbox.size()[0] == 0:
        # Stay on loc's device so callers can combine the result with other
        # tensors without a device mismatch.
        return torch.zeros((0, 4), dtype=loc.dtype, device=loc.device)

    # Width/height and centre of the source boxes, as column vectors.
    src_width = torch.unsqueeze(src_bbox[:, 2] - src_bbox[:, 0], -1)
    src_height = torch.unsqueeze(src_bbox[:, 3] - src_bbox[:, 1], -1)
    src_ctr_x = torch.unsqueeze(src_bbox[:, 0], -1) + 0.5 * src_width
    src_ctr_y = torch.unsqueeze(src_bbox[:, 1], -1) + 0.5 * src_height

    dx = loc[:, 0::4]
    dy = loc[:, 1::4]
    dw = loc[:, 2::4]
    dh = loc[:, 3::4]

    # Decoded centre: shifted by a fraction of the source size.
    ctr_x = dx * src_width + src_ctr_x
    ctr_y = dy * src_height + src_ctr_y
    # Decoded width/height: scaled exponentially.
    w = torch.exp(dw) * src_width
    h = torch.exp(dh) * src_height

    # Convert centre/size back to the two-corner format.
    dst_bbox = torch.zeros_like(loc)
    dst_bbox[:, 0::4] = ctr_x - 0.5 * w
    dst_bbox[:, 1::4] = ctr_y - 0.5 * h
    dst_bbox[:, 2::4] = ctr_x + 0.5 * w
    dst_bbox[:, 3::4] = ctr_y + 0.5 * h
    return dst_bbox
# 4. 对Proposal 建议框加以利用(RoiPoolingConv): 收集输入的feature maps和proposals,综合这些信息后提取proposal feature maps,
# 送入后续全连接层,利用proposal feature maps计算proposal的类别,
# 同时再次bounding box regression获得检测框最终的精确位置
# RoI Pooling: 对非均匀尺寸的输入执行最大池化以获得固定尺寸的特征图
from torchvision.ops import RoIPool
class Resnet50RoIHead(nn.Module):
    """Second-stage head: RoI pooling, classifier trunk, then two linear heads.

    Args:
        n_class: number of classes including background.
        roi_size: output height and width of the RoI pooling layer.
        spatial_scale: factor ``RoIPool`` applies to box coordinates to map
            them onto the feature map.
        classifier: module applied to the pooled features; its flattened
            output must have 2048 features per RoI.
    """

    def __init__(self, n_class, roi_size, spatial_scale, classifier):
        super().__init__()
        self.classifier = classifier
        # Per-class box regression: four offsets for each class.
        self.cls_loc = nn.Linear(2048, n_class * 4)
        # Per-class classification scores.
        self.score = nn.Linear(2048, n_class)
        normal_init(self.cls_loc, 0, 0.001)
        normal_init(self.score, 0, 0.01)

        self.n_class = n_class
        self.roi_size = roi_size
        self.spatial_scale = spatial_scale
        self.roi = RoIPool((self.roi_size, self.roi_size), self.spatial_scale)

    def forward(self, x, rois, roi_indices, img_size):
        """Classify and regress every RoI.

        Args:
            x: feature map of shape ``(n, c, h, w)``.
            rois: boxes of shape ``(R, 4)``; passed to ``RoIPool`` unchanged,
                so coordinate scaling is governed by ``spatial_scale`` alone.
            roi_indices: batch index of each RoI, shape ``(R,)``.
            img_size: accepted for interface compatibility; not used here.

        Returns:
            ``(roi_cls_locs, roi_scores)`` reshaped to
            ``(n, R / n, n_class * 4)`` and ``(n, R / n, n_class)``.
        """
        n = x.shape[0]

        # torch.as_tensor accepts numpy arrays as well as CPU or CUDA tensors
        # and moves them next to the feature map. The legacy
        # torch.Tensor(...) constructor rejects CUDA tensors.
        roi_indices = torch.as_tensor(roi_indices, dtype=torch.float32, device=x.device)
        rois = torch.as_tensor(rois, dtype=torch.float32, device=x.device)

        # RoIPool expects rows of [batch_index, x1, y1, x2, y2].
        indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1).contiguous()

        # Crop the shared feature map with the proposals.
        pool = self.roi(x, indices_and_rois)
        fc7 = self.classifier(pool)
        fc7 = fc7.view(fc7.size(0), -1)

        roi_cls_locs = self.cls_loc(fc7)
        roi_scores = self.score(fc7)
        roi_cls_locs = roi_cls_locs.view(n, -1, roi_cls_locs.size(1))
        roi_scores = roi_scores.view(n, -1, roi_scores.size(1))
        return roi_cls_locs, roi_scores
# 7. 建议框网络训练
# 编码 产生rpn网络中分类和回归的ground truth值,之后在计算rpn的loss时会用到
class AnchorTargetCreator(object):
    """Build RPN training targets (regression offsets and labels) for anchors.

    Labels are 1 for positive, 0 for negative and -1 for ignored anchors.

    Args:
        n_sample: maximum number of labelled (non-ignored) anchors.
        pos_iou_thresh: anchors whose best IoU reaches this are positive.
        neg_iou_thresh: anchors whose best IoU is below this are negative.
        pos_ratio: maximum fraction of ``n_sample`` that may be positive.
    """

    def __init__(self, n_sample=256, pos_iou_thresh=0.7, neg_iou_thresh=0.3, pos_ratio=0.5):
        self.n_sample = n_sample
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh = neg_iou_thresh
        self.pos_ratio = pos_ratio

    def __call__(self, bbox, anchor, img_size):
        """Return ``(loc, label)`` for every anchor.

        Args:
            bbox: ground-truth boxes, shape ``(G, 4)``; ``G`` may be 0.
            anchor: anchors, shape ``(N, 4)``.
            img_size: accepted for interface compatibility; not used here.

        Returns:
            ``loc`` of shape ``(N, 4)``: offsets from each anchor to its
            best-matching ground-truth box (all zeros when there is no
            ground truth), and ``label`` of shape ``(N,)``, int32.
        """
        argmax_ious, label = self._create_label(anchor, bbox)
        if len(bbox) == 0:
            # Nothing to regress towards; no anchor is positive, so these
            # offsets never contribute to the localisation loss.
            return np.zeros_like(anchor), label
        loc = bbox2loc(anchor, bbox[argmax_ious])
        return loc, label

    def _create_label(self, anchor, bbox):
        """Label the anchors and subsample them to at most ``n_sample``."""
        label = np.full((len(anchor),), -1, dtype=np.int32)

        # argmax_ious:    best ground-truth index for each anchor
        # max_ious:       IoU with that ground-truth box
        # gt_argmax_ious: anchors that reach the best IoU of some ground truth
        argmax_ious, max_ious, gt_argmax_ious = self._calc_ious(anchor, bbox)

        label[max_ious < self.neg_iou_thresh] = 0
        # Every ground-truth box gets at least one positive anchor.
        label[gt_argmax_ious] = 1
        label[max_ious >= self.pos_iou_thresh] = 1

        # Cap the number of positives by ignoring a random surplus.
        n_pos = int(self.pos_ratio * self.n_sample)
        pos_index = np.where(label == 1)[0]
        if len(pos_index) > n_pos:
            disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
            label[disable_index] = -1

        # Fill the rest of the budget with negatives.
        n_neg = self.n_sample - np.sum(label == 1)
        neg_index = np.where(label == 0)[0]
        if len(neg_index) > n_neg:
            disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace=False)
            label[disable_index] = -1

        return argmax_ious, label

    def _calc_ious(self, anchor, bbox):
        """Return per-anchor best matches and per-ground-truth best anchors."""
        n_anchor = len(anchor)
        if len(bbox) == 0:
            # No ground truth: every anchor has IoU 0 and there is no forced
            # positive. argmax over an empty axis would raise otherwise.
            return (np.zeros(n_anchor, dtype=np.int64),
                    np.zeros(n_anchor, dtype=np.float32),
                    np.zeros(0, dtype=np.int64))

        # Rows are anchors, columns are ground-truth boxes.
        ious = bbox_iou(anchor, bbox)
        argmax_ious = ious.argmax(axis=1)
        max_ious = ious[np.arange(n_anchor), argmax_ious]

        gt_argmax_ious = ious.argmax(axis=0)
        gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
        # Include every anchor that ties with a ground truth's best IoU.
        gt_argmax_ious = np.where(ious == gt_max_ious)[0]

        return argmax_ious, max_ious, gt_argmax_ious
def bbox_iou(bbox_a, bbox_b):
    """Pairwise intersection-over-union between two sets of boxes.

    Args:
        bbox_a: array of shape ``(N, 4)``.
        bbox_b: array of shape ``(K, 4)``.

    Returns:
        Array of shape ``(N, K)`` with the IoU of every pair.

    Raises:
        IndexError: if either input does not have four columns (both inputs
            are printed first).
    """
    if not (bbox_a.shape[1] == 4 and bbox_b.shape[1] == 4):
        print(bbox_a, bbox_b)
        raise IndexError

    # Broadcasting (N, 1, 2) against (K, 2) yields (N, K, 2): the corners of
    # the intersection rectangle of every pair.
    inter_min = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
    inter_max = np.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])

    # Pairs that do not overlap along both axes get zero intersection.
    overlapping = (inter_min < inter_max).all(axis=2)
    inter_area = np.prod(inter_max - inter_min, axis=2) * overlapping

    area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
    area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)

    union_area = area_a[:, None] + area_b - inter_area
    return inter_area / union_area
# 已知建议框与GT BOX,求出位置偏差loc(dx,dy,dw,dh)
def bbox2loc(src_bbox, dst_bbox):
    """Encode target boxes as offsets ``(dx, dy, dw, dh)`` from source boxes.

    Args:
        src_bbox: source boxes (proposals or anchors), float array ``(N, 4)``.
        dst_bbox: target boxes (ground truth), array ``(N, 4)``.

    Returns:
        Array of shape ``(N, 4)``: centre shifts divided by the source
        size, and the log of the size ratios.
    """
    src_w = src_bbox[:, 2] - src_bbox[:, 0]
    src_h = src_bbox[:, 3] - src_bbox[:, 1]
    src_cx = src_bbox[:, 0] + 0.5 * src_w
    src_cy = src_bbox[:, 1] + 0.5 * src_h

    dst_w = dst_bbox[:, 2] - dst_bbox[:, 0]
    dst_h = dst_bbox[:, 3] - dst_bbox[:, 1]
    dst_cx = dst_bbox[:, 0] + 0.5 * dst_w
    dst_cy = dst_bbox[:, 1] + 0.5 * dst_h

    # Guard the divisions below against zero-sized source boxes.
    tiny = np.finfo(src_h.dtype).eps
    src_w = np.maximum(src_w, tiny)
    src_h = np.maximum(src_h, tiny)

    dx = (dst_cx - src_cx) / src_w
    dy = (dst_cy - src_cy) / src_h
    dw = np.log(dst_w / src_w)
    dh = np.log(dst_h / src_h)

    return np.stack((dx, dy, dw, dh), axis=1)
# 8 ROI网络训练
# 产生分类和回归的ground truth,后续计算faster rcnn的loss时需要用到
class ProposalTargetCreator(object):
    """Sample proposals and build second-stage training targets.

    Args:
        n_sample: number of proposals to sample per image.
        pos_ratio: maximum fraction of ``n_sample`` that may be positive.
        pos_iou_thresh: proposals whose best IoU reaches this are positive.
        neg_iou_thresh_hi, neg_iou_thresh_lo: proposals whose best IoU lies
            in ``[lo, hi)`` are negative candidates.
    """

    def __init__(self, n_sample=128,
                 pos_ratio=0.5, pos_iou_thresh=0.5,
                 neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
                 ):
        self.n_sample = n_sample
        self.pos_ratio = pos_ratio
        self.pos_iou_thresh = pos_iou_thresh
        self.neg_iou_thresh_hi = neg_iou_thresh_hi
        self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE: default 0.1 in py-faster-rcnn

    def __call__(self, roi, bbox, label,
                 loc_normalize_mean=(0., 0., 0., 0.),
                 loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
        """Sample RoIs and return their regression and class targets.

        Args:
            roi: proposals as a torch tensor of shape ``(R, 4)``.
            bbox: ground-truth boxes, numpy array ``(G, 4)``; ``G`` may be 0.
            label: ground-truth class indices, numpy array ``(G,)``.
            loc_normalize_mean, loc_normalize_std: normalisation applied to
                the regression targets.

        Returns:
            ``(sample_roi, gt_roi_loc, gt_roi_label)``. Positives come
            first; labels are shifted by one so that 0 means background.
        """
        # Ground-truth boxes are added to the candidate pool.
        roi = np.concatenate((roi.detach().cpu().numpy(), bbox), axis=0)
        has_gt = len(bbox) > 0

        pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
        if has_gt:
            iou = bbox_iou(roi, bbox)
            gt_assignment = iou.argmax(axis=1)
            max_iou = iou.max(axis=1)
            # +1 because class 0 is reserved for background.
            gt_roi_label = label[gt_assignment] + 1
        else:
            # No ground truth: every proposal is background with IoU 0.
            gt_assignment = np.zeros(len(roi), dtype=np.int64)
            max_iou = np.zeros(len(roi))
            gt_roi_label = np.zeros(len(roi))

        # Positive samples.
        pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
        pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
        if pos_index.size > 0:
            pos_index = np.random.choice(
                pos_index, size=pos_roi_per_this_image, replace=False)

        # Negative samples fill the remaining slots; sample with replacement
        # only when there are too few candidates.
        neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
                             (max_iou >= self.neg_iou_thresh_lo))[0]
        if neg_index.size > 0:
            neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
            neg_index = np.random.choice(
                neg_index, size=neg_roi_per_this_image,
                replace=neg_index.size < neg_roi_per_this_image)

        keep_index = np.append(pos_index, neg_index)
        gt_roi_label = gt_roi_label[keep_index]
        # Everything after the positives is background.
        gt_roi_label[pos_roi_per_this_image:] = 0
        sample_roi = roi[keep_index]

        if has_gt:
            gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
        else:
            gt_roi_loc = np.zeros_like(sample_roi)
        gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
                       ) / np.array(loc_normalize_std, np.float32))
        return sample_roi, gt_roi_loc, gt_roi_label
# 9 loss 函数
from collections import namedtuple
# Named container for the four loss terms plus their sum, in the order FasterRCNNTrainer.forward builds them.
LossTuple = namedtuple('LossTuple', ['rpn_loc_loss', 'rpn_cls_loss', 'roi_loc_loss', 'roi_cls_loss', 'total_loss'])
'''
1. 首先使用backbone网络提取输入图片的特征
2. 使用RPN网络来提取rois
3. 如果是训练,得到proposal_target,即分类和回归的ground truth,后续计算faster rcnn(6, 7)的loss时需要用到(8, 9)
4. 使用roi_pooling得到rois的feature map
5. 使用classifier提取特征
6. 使用faster_rcnn_cls得到分类结果
7. 使用faster_rcnn_reg得到回归结果
8. 如果是训练,计算分类loss
9. 如果是训练,计算回归loss
'''
class FasterRCNNTrainer(nn.Module):
    """Wrap a Faster R-CNN model and compute its four training losses.

    Args:
        faster_rcnn: model exposing ``extractor``, ``rpn`` and ``head``.
        optimizer: optimiser stepped by ``train_step``.
    """

    def __init__(self, faster_rcnn, optimizer):
        super().__init__()
        self.faster_rcnn = faster_rcnn
        self.rpn_sigma = 1
        self.roi_sigma = 1

        # Target builders for the RPN stage and the RoI head stage.
        self.anchor_target_creator = AnchorTargetCreator()
        self.proposal_target_creator = ProposalTargetCreator()

        self.loc_normalize_mean = [0, 0, 0, 0]
        self.loc_normalize_std = [0.1, 0.1, 0.2, 0.2]

        self.optimizer = optimizer

    def forward(self, imgs, bboxes, labels, scale):
        """Compute the batch-averaged losses.

        Args:
            imgs: image batch tensor; ``imgs.shape[2:]`` is used as the
                image size.
            bboxes: per-image ground-truth boxes.
            labels: per-image ground-truth class indices.
            scale: forwarded to the RPN.

        Returns:
            ``LossTuple`` with the RPN localisation/classification losses,
            the RoI localisation/classification losses, and their sum.
        """
        batch_size = imgs.shape[0]
        img_size = imgs.shape[2:]

        # Shared features, then proposals from the RPN.
        base_feature = self.faster_rcnn.extractor(imgs)
        rpn_locs, rpn_scores, rois, roi_indices, anchor = self.faster_rcnn.rpn(base_feature, img_size, scale)

        running = [0, 0, 0, 0]
        for i in range(batch_size):
            bbox = bboxes[i]
            label = labels[i]
            rpn_loc = rpn_locs[i]
            rpn_score = rpn_scores[i]
            roi = rois[roi_indices == i]
            feature = base_feature[i]

            # RPN targets: one offset row and one label per anchor.
            gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(bbox, anchor, img_size)
            gt_rpn_loc = torch.Tensor(gt_rpn_loc)
            gt_rpn_label = torch.Tensor(gt_rpn_label).long()
            if rpn_loc.is_cuda:
                gt_rpn_loc = gt_rpn_loc.cuda()
                gt_rpn_label = gt_rpn_label.cuda()

            # RPN losses; anchors labelled -1 are ignored by the classifier.
            rpn_loc_loss = _fast_rcnn_loc_loss(rpn_loc, gt_rpn_loc, gt_rpn_label, self.rpn_sigma)
            rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label, ignore_index=-1)

            # Head targets: sampled RoIs with their offsets and class labels.
            sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
                roi, bbox, label, self.loc_normalize_mean, self.loc_normalize_std)
            sample_roi = torch.Tensor(sample_roi)
            gt_roi_loc = torch.Tensor(gt_roi_loc)
            gt_roi_label = torch.Tensor(gt_roi_label).long()
            # The head sees a single image, so every RoI has batch index 0.
            sample_roi_index = torch.zeros(len(sample_roi))
            if feature.is_cuda:
                sample_roi = sample_roi.cuda()
                sample_roi_index = sample_roi_index.cuda()
                gt_roi_loc = gt_roi_loc.cuda()
                gt_roi_label = gt_roi_label.cuda()

            roi_cls_loc, roi_score = self.faster_rcnn.head(
                torch.unsqueeze(feature, 0), sample_roi, sample_roi_index, img_size)

            # Pick, for each RoI, the regression output of its target class.
            n_sample = roi_cls_loc.size()[1]
            roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
            roi_loc = roi_cls_loc[torch.arange(0, n_sample), gt_roi_label]

            roi_loc_loss = _fast_rcnn_loc_loss(roi_loc, gt_roi_loc, gt_roi_label.data, self.roi_sigma)
            roi_cls_loss = F.cross_entropy(roi_score[0], gt_roi_label)

            per_image = (rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss)
            running = [acc + term for acc, term in zip(running, per_image)]

        losses = [acc / batch_size for acc in running]
        losses.append(sum(losses))
        return LossTuple(*losses)

    def train_step(self, imgs, bboxes, labels, scale):
        """Run one optimisation step and return the losses."""
        self.optimizer.zero_grad()
        losses = self.forward(imgs, bboxes, labels, scale)
        losses.total_loss.backward()
        self.optimizer.step()
        return losses
def _smooth_l1_loss(x, t, sigma):
sigma_squared = sigma ** 2
regression_diff = (x - t)
regression_diff = regression_diff.abs()
regression_loss = torch.where(
regression_diff < (1. / sigma_squared),
0.5 * sigma_squared * regression_diff ** 2,
regression_diff - 0.5 / sigma_squared
)
return regression_loss.sum()
def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    """Smooth-L1 localisation loss over positive samples only.

    Rows whose label is greater than zero are selected; the summed loss is
    divided by the number of such rows, or by one when there are none.
    """
    positive = gt_label > 0
    loc_loss = _smooth_l1_loss(pred_loc[positive], gt_loc[positive], sigma)

    num_pos = positive.sum().float()
    # Avoid dividing by zero when no sample is positive.
    normaliser = torch.max(num_pos, torch.ones_like(num_pos))
    loc_loss /= normaliser
    return loc_loss
# 10 Faster R-CNN训练
from tqdm import tqdm
from torch.utils.data.dataset import Dataset
from PIL import Image
import cv2
from torch.utils.data import DataLoader
def _imgs_to_tensor(imgs, cuda):
    """Convert a numpy image batch to a float32 tensor, on the GPU if *cuda*."""
    tensor = torch.from_numpy(imgs).type(torch.FloatTensor)
    return tensor.cuda() if cuda else tensor


def fit_ont_epoch(net, epoch, epoch_size, epoch_size_val, gen, genval, Epoch, cuda):
    """Train for one epoch, validate, and save a checkpoint.

    Relies on the module-level names ``train_util`` (the trainer wrapping the
    model) and ``optimizer`` (used only to display the learning rate).

    Args:
        net: model whose ``state_dict`` is saved at the end of the epoch.
        epoch: zero-based index of the current epoch.
        epoch_size: number of training iterations to run.
        epoch_size_val: number of validation iterations to run.
        gen: iterable of training batches ``(imgs, boxes, labels)``.
        genval: iterable of validation batches with the same layout.
        Epoch: total number of epochs, used for display only.
        cuda: move image batches to the GPU when true.
    """
    total_loss = 0
    rpn_loc_loss = 0
    rpn_cls_loss = 0
    roi_loc_loss = 0
    roi_cls_loss = 0
    val_toal_loss = 0

    with tqdm(total=epoch_size, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(gen):
            if iteration >= epoch_size:
                break
            imgs, boxes, labels = batch[0], batch[1], batch[2]
            with torch.no_grad():
                imgs = _imgs_to_tensor(imgs, cuda)

            losses = train_util.train_step(imgs, boxes, labels, 1)
            rpn_loc, rpn_cls, roi_loc, roi_cls, total = losses
            total_loss += total.item()
            rpn_loc_loss += rpn_loc.item()
            rpn_cls_loss += rpn_cls.item()
            roi_loc_loss += roi_loc.item()
            roi_cls_loss += roi_cls.item()

            # Running means over the iterations seen so far.
            pbar.set_postfix(**{'total': total_loss / (iteration + 1),
                                'rpn_loc': rpn_loc_loss / (iteration + 1),
                                'rpn_cls': rpn_cls_loss / (iteration + 1),
                                'roi_loc': roi_loc_loss / (iteration + 1),
                                'roi_cls': roi_cls_loss / (iteration + 1),
                                'lr': get_lr(optimizer)})
            pbar.update(1)

    print('Start Validation')
    # The validation pass is bounded by epoch_size_val, not the training size.
    with tqdm(total=epoch_size_val, desc=f'Epoch {epoch + 1}/{Epoch}', postfix=dict, mininterval=0.3) as pbar:
        for iteration, batch in enumerate(genval):
            if iteration >= epoch_size_val:
                break
            imgs, boxes, labels = batch[0], batch[1], batch[2]
            # No gradients are needed for validation.
            with torch.no_grad():
                imgs = _imgs_to_tensor(imgs, cuda)
                train_util.optimizer.zero_grad()
                losses = train_util.forward(imgs, boxes, labels, 1)
                _, _, _, _, val_total = losses
                val_toal_loss += val_total.item()
            pbar.set_postfix(**{'total_loss': val_toal_loss / (iteration + 1)})
            pbar.update(1)
    print('Finish Validation')

    mean_train_loss = total_loss / (epoch_size + 1)
    mean_val_loss = val_toal_loss / (epoch_size_val + 1)
    print('Epoch:' + str(epoch + 1) + '/' + str(Epoch))
    print('Total Loss: {} || Val Loss: {}'.format(mean_train_loss, mean_val_loss))
    print('Saving state, iter:', str(epoch + 1))
    # The checkpoint name carries the same values that were just printed.
    torch.save(net.state_dict(),
               'vocdataset/Epoch{}-Total_Loss{}-Val_Loss{}.pth'.format(epoch + 1, mean_train_loss, mean_val_loss))
class FasterRCNN(nn.Module):
    """Faster R-CNN detector: backbone, region proposal network and RoI head.

    Args:
        num_classes: number of object classes, excluding background.
        mode: forwarded to the RPN to choose its proposal limits.
        feat_stride: downsampling factor between the input image and the
            shared feature map.
        anchor_scales, ratios: anchor configuration forwarded to the RPN.
        backbone: only ``'resnet50'`` is handled; for any other value no
            sub-modules are created.
    """

    def __init__(self, num_classes, mode="training", feat_stride=16, anchor_scales=[8, 16, 32], ratios=[0.5, 1, 2],
                 backbone='resnet50'):
        super().__init__()
        self.feat_stride = feat_stride
        if backbone == "resnet50":
            self.extractor, classifier = resnet50()

            self.rpn = RegionProposalNet(
                in_channels=1024,
                mid_channels=512,
                ratios=ratios,
                anchor_scales=anchor_scales,
                feat_stride=feat_stride,
                mode=mode
            )
            # The RPN emits proposals in input-image pixels (it clamps them to
            # img_size), while RoIPool crops the feature map, which is
            # feat_stride times smaller. spatial_scale must therefore be
            # 1 / feat_stride; with 1 the pooled windows land on the wrong
            # regions.
            self.head = Resnet50RoIHead(
                n_class=num_classes + 1,
                roi_size=14,
                spatial_scale=1. / feat_stride,
                classifier=classifier
            )

    def forward(self, x, scale=1.):
        """Detect objects in an image batch.

        Returns:
            ``(roi_cls_locs, roi_scores, rois, roi_indices)``: per-RoI
            regression and classification outputs from the head, and the
            proposals with their batch indices from the RPN.
        """
        img_size = x.shape[2:]
        base_feature = self.extractor(x)
        _, _, rois, roi_indices, _ = self.rpn(base_feature, img_size, scale)
        roi_cls_locs, roi_scores = self.head(base_feature, rois, roi_indices, img_size)
        return roi_cls_locs, roi_scores, rois, roi_indices

    def freeze_bn(self):
        """Put every ``BatchNorm2d`` layer into eval mode.

        In eval mode batch norm uses its stored running statistics instead
        of batch statistics, so small batches cannot disturb the
        normalisation.
        """
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
class FRCNNDataset(Dataset):
    """Dataset yielding ``(image, boxes, labels)`` from annotation lines.

    Each annotation line is whitespace separated: an image path followed by
    zero or more ``x1,y1,x2,y2,class`` integer groups.

    Args:
        train_lines: list of annotation lines.
        shape: ``[height, width]`` of the images produced.
        is_train: apply random augmentation when true; otherwise letterbox
            the image into the target shape.
    """

    def __init__(self, train_lines, shape=[600, 600], is_train=True):
        self.train_lines = train_lines
        self.train_batches = len(train_lines)
        self.shape = shape
        self.is_train = is_train

    def __len__(self):
        return self.train_batches

    def rand(self, a=0, b=1):
        """Return a uniform random float in ``[a, b)``."""
        return np.random.rand() * (b - a) + a

    def _correct_boxes(self, box, nw, nh, iw, ih, dx, dy, w, h, flip):
        """Map boxes from the source image into the output canvas.

        Boxes are shuffled, scaled by ``nw / iw`` and ``nh / ih``, shifted by
        ``(dx, dy)``, optionally mirrored horizontally, clipped to the
        ``w x h`` canvas, and dropped when narrower or shorter than two
        pixels. ``box`` is modified in place.

        Returns:
            float array of shape ``(kept, 5)``.
        """
        if len(box) == 0:
            return np.zeros((0, 5))

        np.random.shuffle(box)
        box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
        box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
        if flip:
            box[:, [0, 2]] = w - box[:, [2, 0]]
        box[:, 0:2][box[:, 0:2] < 0] = 0
        box[:, 2][box[:, 2] > w] = w
        box[:, 3][box[:, 3] > h] = h
        box_w = box[:, 2] - box[:, 0]
        box_h = box[:, 3] - box[:, 1]
        box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard invalid boxes

        box_data = np.zeros((len(box), 5))
        box_data[:len(box)] = box
        return box_data

    def get_random_data(self, annotation_line, jitter=.3, hue=.1, sat=1.5, val=1.5, random=True):
        """Load one sample, optionally with random augmentation.

        Args:
            annotation_line: one annotation line (see the class docstring).
            jitter: aspect-ratio jitter range.
            hue: hue shift range, as a fraction of the full hue circle.
            sat, val: upper bounds of the saturation and value scale factors.
            random: augment when true; otherwise letterbox only.

        Returns:
            ``(image_data, box_data)``: float RGB image with values in
            0..255 and a float array of shape ``(num_boxes, 5)``.
        """
        line = annotation_line.split()
        image = Image.open(line[0])
        iw, ih = image.size
        h, w = self.shape
        box = np.array([np.array(list(map(int, item.split(',')))) for item in line[1:]])

        if not random:
            # Letterbox: scale to fit and centre on a grey canvas.
            scale = min(w / iw, h / ih)
            nw = int(iw * scale)
            nh = int(ih * scale)
            dx = (w - nw) // 2
            dy = (h - nh) // 2

            image = image.resize((nw, nh), Image.BICUBIC)
            new_image = Image.new('RGB', (w, h), (128, 128, 128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.float32)

            box_data = self._correct_boxes(box, nw, nh, iw, ih, dx, dy, w, h, flip=False)
            return image_data, box_data

        # Random aspect ratio and scale.
        new_ar = w / h * self.rand(1 - jitter, 1 + jitter) / self.rand(1 - jitter, 1 + jitter)
        scale = self.rand(.5, 1.5)
        if new_ar < 1:
            nh = int(scale * h)
            nw = int(nh * new_ar)
        else:
            nw = int(scale * w)
            nh = int(nw / new_ar)
        image = image.resize((nw, nh), Image.BICUBIC)

        # Random placement on a grey canvas.
        dx = int(self.rand(0, w - nw))
        dy = int(self.rand(0, h - nh))
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image = new_image

        # Random horizontal flip.
        flip = self.rand() < .5
        if flip:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)

        # Random colour distortion in HSV space.
        hue = self.rand(-hue, hue)
        sat = self.rand(1, sat) if self.rand() < .5 else 1 / self.rand(1, sat)
        val = self.rand(1, val) if self.rand() < .5 else 1 / self.rand(1, val)
        x = cv2.cvtColor(np.array(image, np.float32) / 255, cv2.COLOR_RGB2HSV)
        # For float32 input OpenCV expresses hue in degrees (0..360), so the
        # shift wraps around at 360 rather than at 1.
        x[..., 0] += hue * 360
        x[..., 0][x[..., 0] > 360] -= 360
        x[..., 0][x[..., 0] < 0] += 360
        x[..., 1] *= sat
        x[..., 2] *= val
        x[x[:, :, 0] > 360, 0] = 360
        x[:, :, 1:][x[:, :, 1:] > 1] = 1
        x[x < 0] = 0
        image_data = cv2.cvtColor(x, cv2.COLOR_HSV2RGB) * 255

        box_data = self._correct_boxes(box, nw, nh, iw, ih, dx, dy, w, h, flip=flip)
        return image_data, box_data

    def __getitem__(self, index):
        """Return ``(img, box, label)`` for sample ``index``.

        ``img`` is channel-first and scaled to 0..1; ``box`` holds the first
        four columns of the annotation and ``label`` the last column.
        """
        img, y = self.get_random_data(self.train_lines[index], random=self.is_train)
        img = np.transpose(img / 255.0, [2, 0, 1])
        box = y[:, :4]
        label = y[:, -1]
        return img, box, label
# DataLoader中collate_fn使用
def frcnn_dataset_collate(batch):
    """Collate ``(img, box, label)`` samples for a ``DataLoader``.

    Images are stacked into one numpy array; boxes and labels stay as Python
    lists because their lengths differ from image to image.
    """
    images = np.array([img for img, _, _ in batch])
    bboxes = [box for _, box, _ in batch]
    labels = [label for _, _, label in batch]
    return images, bboxes, labels
def get_lr(optimizer):
    """Return the learning rate of the optimiser's first parameter group.

    Returns ``None`` when the optimiser has no parameter groups.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']
# 11 数据集划分
import os
import random
import xml.etree.ElementTree as ET
random.seed(1)
def data_ide(xmlfilepath, saveBasePath, trainval_perscent=1, train_persent=1, num=None):
    """Split the annotation files into trainval/train/val/test id lists.

    Writes ``trainval.txt``, ``train.txt``, ``val.txt`` and ``test.txt`` into
    ``saveBasePath``, one file stem per line. Uses the global ``random``
    state.

    Args:
        xmlfilepath: directory scanned for ``*.xml`` files.
        saveBasePath: directory receiving the four list files.
        trainval_perscent: fraction of the samples assigned to trainval; the
            rest go to test.
        train_persent: fraction of trainval assigned to train; the rest go
            to val.
        num: number of samples to consider; defaults to all XML files found.
            Indices beyond the available files are reported and skipped.
    """
    total_xml = [entry for entry in os.listdir(xmlfilepath) if entry.endswith(".xml")]

    if num is None:
        num = len(total_xml)
    indices = range(num)
    tv = int(num * trainval_perscent)
    tr = int(tv * train_persent)
    trainval = random.sample(indices, tv)
    train = random.sample(trainval, tr)
    # Sets make the membership tests in the loop below O(1).
    trainval_ids = set(trainval)
    train_ids = set(train)

    def _open_list(filename):
        # utf-8 matches the encoding the annotation step reads these with.
        return open(os.path.join(saveBasePath, filename), 'w', encoding='utf-8')

    with _open_list('trainval.txt') as ftrainval, \
            _open_list('test.txt') as ftest, \
            _open_list('train.txt') as ftrain, \
            _open_list('val.txt') as fval:
        for i in indices:
            if i >= len(total_xml):
                print("error: list out of range, please decrease 'num'")
                continue
            # Strip the ".xml" suffix.
            name = total_xml[i][:-4] + '\n'
            if i in trainval_ids:
                ftrainval.write(name)
                if i in train_ids:
                    ftrain.write(name)
                else:
                    fval.write(name)
            else:
                ftest.write(name)
def convert_annotation(xmlfilepath, image_id, list_file, classes):
    """Append the boxes of one annotation file to ``list_file``.

    Parses ``<xmlfilepath>/<image_id>.xml`` and, for every ``object``
    element whose ``name`` is in ``classes`` and whose ``difficult`` flag is
    not 1, writes `` xmin,ymin,xmax,ymax,cls_id`` (with a leading space and
    no trailing newline) to ``list_file``. Coordinates are truncated to
    integers.

    Args:
        xmlfilepath: directory containing the annotation files.
        image_id: file name without the ``.xml`` extension.
        list_file: writable text file object.
        classes: sequence of class names; the written ``cls_id`` is the
            index of the object's name in it.
    """
    xml_path = os.path.join(xmlfilepath, str(image_id) + '.xml')
    # Parse inside a context manager so the file handle is always released.
    with open(xml_path, encoding='utf-8') as in_file:
        root = ET.parse(in_file).getroot()

    for obj in root.iter('object'):
        difficult = 0
        difficult_node = obj.find('difficult')
        if difficult_node is not None:
            # An empty <difficult/> tag has text None; treat it as not difficult.
            difficult = difficult_node.text or 0
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        # Go through float first so values such as "48.6" are accepted.
        coords = [
            int(float(xmlbox.find(tag).text))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        ]
        list_file.write(" " + ",".join(str(value) for value in coords) + "," + str(cls_id))
def voc_annotation(sets, classes, imagePath, BasePath, xmlfilepath):
    """Write one annotation list file per ``(year, image_set)`` pair.

    For each pair, the image ids are read from ``<BasePath>/<image_set>.txt``
    and ``<BasePath>/<year>_<image_set>.txt`` is (re)created with one line
    per image: the ``.jpg`` path under ``imagePath`` followed by whatever
    ``convert_annotation`` appends for that image.

    Args:
        sets: iterable of ``(year, image_set)`` pairs.
        classes: class names forwarded to ``convert_annotation``.
        imagePath: directory prefix for the image paths written out.
        BasePath: directory holding the id lists and receiving the outputs.
        xmlfilepath: directory containing the ``.xml`` annotation files.
    """
    for year, image_set in sets:
        ids_path = os.path.join(BasePath, str(image_set) + '.txt')
        # Context managers make sure both files are closed, even on errors.
        with open(ids_path, encoding='utf-8') as ids_file:
            image_ids = ids_file.read().strip().split()

        out_path = os.path.join(BasePath, '{}_{}.txt'.format(year, image_set))
        with open(out_path, 'w', encoding='utf-8') as list_file:
            for image_id in image_ids:
                list_file.write(os.path.join(imagePath, str(image_id) + '.jpg'))
                convert_annotation(xmlfilepath, image_id, list_file, classes)
                list_file.write('\n')
if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Dataset preparation settings (arguments for data_ide / voc_annotation)
    # ------------------------------------------------------------------
    xmlfilepath = '/VOC2007/Annotations/'
    imagePath = '/VOC2007/JPEGImages/'
    saveBasePath = 'vocdataset/'
    trainval_perscent = 0.9
    train_persent = 0.8
    num_data = None
    sets = [('tianjin_day', 'train'), ('tianjin_day', 'val'), ('tianjin_day', 'test')]
    classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
    # One-off preparation steps: enable them to (re)generate the split files
    # and the per-set annotation lists before training.
    # data_ide(xmlfilepath, saveBasePath, trainval_perscent, train_persent, num_data)
    # voc_annotation(sets, classes, imagePath, saveBasePath, xmlfilepath)

    import torch.backends.cudnn as cudnn
    import torch.optim as optim

    # ------------------------------------------------------------------
    # Training configuration
    # ------------------------------------------------------------------
    Cuda = False
    NUM_CLASSES = len(classes)
    input_shape = [800, 800, 3]
    backbone = "resnet50"
    # NOTE(review): voc_annotation names its outputs '<year>_<image_set>.txt',
    # which for `sets` above would be 'tianjin_day_train.txt' etc. The names
    # below do not follow that pattern - confirm the files exist.
    annotation_path_train = os.path.join(saveBasePath, '2007_train.txt')
    annotation_path_val = os.path.join(saveBasePath, '2007_day_val.txt')
    annotation_path_test = os.path.join(saveBasePath, '2007_day_test.txt')

    # Load the pretrained weights.
    model_path = os.path.join(saveBasePath, 'voc_weights_resnet.pth')
    model = FasterRCNN(NUM_CLASSES, backbone=backbone)
    print('Loading weights into state dict...')
    device = torch.device('cuda' if (torch.cuda.is_available() and Cuda) else 'cpu')
    model_dict = model.state_dict()
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    pretrained_dict = torch.load(model_path, map_location=device)
    # Keep only tensors that exist in the model with a matching shape. The
    # `k in model_dict` guard avoids a KeyError when the checkpoint carries
    # keys the current model does not have.
    pretrained_dict = {
        k: v for k, v in pretrained_dict.items()
        if k in model_dict and np.shape(model_dict[k]) == np.shape(v)
    }
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    print('Finished!')

    net = model.train()
    if Cuda:
        net = torch.nn.DataParallel(model)
        cudnn.benchmark = True
        net = net.cuda()

    with open(annotation_path_train) as f, open(annotation_path_val) as f_v, open(annotation_path_test) as f_t:
        lines_train = f.readlines()
        lines_val = f_v.readlines()
        lines_test = f_t.readlines()
    # Shuffle with a fixed seed, then return NumPy's RNG to an unseeded state.
    np.random.seed(1)
    np.random.shuffle(lines_train)
    np.random.shuffle(lines_val)
    np.random.shuffle(lines_test)
    np.random.seed(None)
    num_train = len(lines_train)
    num_val = len(lines_val)
    train_file = lines_train[:]
    val_file = lines_val[:]
    test_file = lines_test[:]

    # The backbone features are generic, so training with the backbone frozen
    # first speeds training up and protects the pretrained weights from being
    # disturbed early on.
    #   Init_Epoch     - first epoch of the run
    #   Freeze_Epoch   - epoch at which the frozen phase ends
    #   Unfreeze_Epoch - epoch at which the whole run ends
    if True:
        lr = 1e-4
        Batch_size = 2
        Init_Epoch = 0
        Freeze_Epoch = 50
        optimizer = optim.Adam(net.parameters(), lr, weight_decay=5e-4)
        lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
        train_dataset = FRCNNDataset(train_file, (input_shape[0], input_shape[1]), is_train=True)
        val_dataset = FRCNNDataset(val_file, (input_shape[0], input_shape[1]), is_train=False)
        gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                         drop_last=True, collate_fn=frcnn_dataset_collate)
        gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                             drop_last=True, collate_fn=frcnn_dataset_collate)
        epoch_size = num_train // Batch_size
        epoch_size_val = num_val // Batch_size
        # Freeze the feature extractor for this phase.
        for param in model.extractor.parameters():
            param.requires_grad = False
        # Freeze the BN layers.
        model.freeze_bn()
        train_util = FasterRCNNTrainer(model, optimizer)
        for epoch in range(Init_Epoch, Freeze_Epoch):
            fit_ont_epoch(net, epoch, epoch_size, epoch_size_val, gen, gen_val, Freeze_Epoch, Cuda)
            lr_scheduler.step()

    if True:
        # Batch_size is carried over unchanged from the frozen phase.
        lr = 1e-5
        Freeze_Epoch = 50
        Unfreeze_Epoch = 100
        optimizer = optim.Adam(net.parameters(), lr, weight_decay=5e-4)
        lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
        train_dataset = FRCNNDataset(train_file, (input_shape[0], input_shape[1]), is_train=True)
        val_dataset = FRCNNDataset(val_file, (input_shape[0], input_shape[1]), is_train=False)
        gen = DataLoader(train_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                         drop_last=True, collate_fn=frcnn_dataset_collate)
        gen_val = DataLoader(val_dataset, shuffle=True, batch_size=Batch_size, num_workers=4, pin_memory=False,
                             drop_last=True, collate_fn=frcnn_dataset_collate)
        epoch_size = num_train // Batch_size
        epoch_size_val = num_val // Batch_size
        # Unfreeze the feature extractor for fine-tuning.
        for param in model.extractor.parameters():
            param.requires_grad = True
        # Freeze the BN layers.
        model.freeze_bn()
        # Rebuild the trainer around the new optimizer. fit_ont_epoch is not
        # handed an optimizer, so it presumably reaches it through the
        # module-level train_util; without this line the optimizer and
        # scheduler created above would never be used and this phase would
        # keep stepping the frozen-phase optimizer.
        train_util = FasterRCNNTrainer(model, optimizer)
        for epoch in range(Freeze_Epoch, Unfreeze_Epoch):
            fit_ont_epoch(net, epoch, epoch_size, epoch_size_val, gen, gen_val, Unfreeze_Epoch, Cuda)
            lr_scheduler.step()