simple-faster-rcnn-pytorch-master代码解读——训练

simple-faster-rcnn-pytorch-master代码解读——训练

个人来讲,弄明白了训练部分的代码后才完整理解了faster-rcnn的整个过程。结合这篇博客记录对代码的理解。
一.首先来看trainer.py。
trainer.py中有很多函数在train.py中调用,所以先记录trainer.py。我分块记录,方便查看。
1.__init__函数主要进行参数初始化。

from __future__ import  absolute_import
import os
from collections import namedtuple
import time
from torch.nn import functional as F
from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator

from torch import nn
import torch as t
from utils import array_tool as at
from utils.vis_tool import Visualizer

from utils.config import opt
from torchnet.meter import ConfusionMeter, AverageValueMeter

# Container bundling the four component losses plus their sum for one
# training step; field order matches the order they are computed in forward().
LossTuple = namedtuple(
    'LossTuple',
    ['rpn_loc_loss', 'rpn_cls_loss', 'roi_loc_loss', 'roi_cls_loss', 'total_loss'])


class FasterRCNNTrainer(nn.Module):
    """Trainer wrapping a FasterRCNN network.

    Builds training targets via the anchor/proposal target creators,
    computes the four losses (RPN loc/cls, RoI loc/cls), and tracks
    them in meters for visdom visualization.
    """

    def __init__(self, faster_rcnn):
        super(FasterRCNNTrainer, self).__init__()  # initialize base nn.Module state

        self.faster_rcnn = faster_rcnn
        self.rpn_sigma = opt.rpn_sigma   # smooth-L1 sigma for the RPN localization loss
        self.roi_sigma = opt.roi_sigma   # smooth-L1 sigma for the RoI-head localization loss

        # target creator create gt_bbox gt_label etc as training targets.
        self.anchor_target_creator = AnchorTargetCreator()  # builds per-anchor cls/loc ground-truth targets for training the RPN
        self.proposal_target_creator = ProposalTargetCreator()   # samples RoIs from the RPN proposals and builds targets for the RoI head

        self.loc_normalize_mean = faster_rcnn.loc_normalize_mean  # mean used to (de)normalize loc regression targets
        self.loc_normalize_std = faster_rcnn.loc_normalize_std   # std used to (de)normalize loc regression targets

        self.optimizer = self.faster_rcnn.get_optimizer()  # optimizer is constructed by the model itself (SGD or Adam per config)
        # visdom wrapper
        self.vis = Visualizer(env=opt.env)

        # indicators for training status
        self.rpn_cm = ConfusionMeter(2)   # RPN confusion matrix: 2 classes (foreground / background)
        self.roi_cm = ConfusionMeter(21)  # RoI-head confusion matrix: 20 object classes + 1 background
        self.meters = {k: AverageValueMeter() for k in LossTuple._fields}  # running average of each loss
2.forward函数是trainer.py的最重要的部分,进行了求损失之前的训练过程和计算了两部分的损失。

    def forward(self, imgs, bboxes, labels, scale):
        """Compute all four Faster R-CNN training losses for one image.

        Args:
            imgs: image batch tensor, shape (1, C, H, W) — batch size must be 1.
            bboxes: ground-truth boxes, shape (1, R, 4).
            labels: ground-truth class labels, shape (1, R).
            scale: factor by which the raw image was resized during preprocessing.

        Returns:
            LossTuple with rpn_loc / rpn_cls / roi_loc / roi_cls losses
            and their sum as total_loss.

        Raises:
            ValueError: if the batch size is not 1.
        """
        # BUG FIX: this line was mis-indented (7 spaces) in the original,
        # which is an IndentationError.
        n = bboxes.shape[0]
        if n != 1:
            raise ValueError('Currently only batch size 1 is supported.')

        _, _, H, W = imgs.shape
        img_size = (H, W)

        # Backbone (e.g. VGG16) features for the whole image.
        features = self.faster_rcnn.extractor(imgs)

        # RPN: per-anchor offsets and objectness scores, plus proposal RoIs.
        rpn_locs, rpn_scores, rois, roi_indices, anchor = \
            self.faster_rcnn.rpn(features, img_size, scale)

        # Since batch size is one, convert variables to singular form.
        bbox = bboxes[0]
        label = labels[0]
        rpn_score = rpn_scores[0]
        rpn_loc = rpn_locs[0]
        roi = rois

        # Sample RoIs and build their classification/regression ground truth.
        # It's fine to break the computation graph of rois: they are treated
        # as constant inputs to the RoI head.
        sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
            roi,
            at.tonumpy(bbox),
            at.tonumpy(label),
            self.loc_normalize_mean,
            self.loc_normalize_std)
        # NOTE: it's all zero because only batch=1 is supported for now.
        sample_roi_index = t.zeros(len(sample_roi))
        roi_cls_loc, roi_score = self.faster_rcnn.head(
            features,
            sample_roi,
            sample_roi_index)

        # ------------------ RPN losses -------------------#
        # Per-anchor targets: offsets to the matched gt box and fg/bg/ignore label.
        gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
            at.tonumpy(bbox),
            anchor,
            img_size)
        gt_rpn_label = at.totensor(gt_rpn_label).long()
        gt_rpn_loc = at.totensor(gt_rpn_loc)
        # The label is passed so non-positive anchors are excluded from the loc loss.
        rpn_loc_loss = _fast_rcnn_loc_loss(
            rpn_loc,
            gt_rpn_loc,
            gt_rpn_label.data,
            self.rpn_sigma)

        # NOTE: default value of ignore_index is -100; here -1 marks ignored anchors.
        rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
        _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]  # drop ignored anchors for the confusion matrix
        _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
        self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())

        # ------------------ ROI losses (fast rcnn loss) -------------------#
        n_sample = roi_cls_loc.shape[0]
        roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
        # For each sampled RoI pick the 4 offsets predicted for its gt class.
        roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(),
                              at.totensor(gt_roi_label).long()]
        gt_roi_label = at.totensor(gt_roi_label).long()
        gt_roi_loc = at.totensor(gt_roi_loc)

        roi_loc_loss = _fast_rcnn_loc_loss(
            roi_loc.contiguous(),
            gt_roi_loc,
            gt_roi_label.data,
            self.roi_sigma)

        roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())

        self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())

        # Total loss is simply the sum of the four components.
        losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
        losses = losses + [sum(losses)]

        return LossTuple(*losses)

3.train_step函数就是进行参数优化。

    def train_step(self, imgs, bboxes, labels, scale):
        """Run one optimization step: forward, backward, update, log.

        Returns the LossTuple computed by forward() so callers can inspect it.
        """
        self.optimizer.zero_grad()                           # clear stale gradients
        all_losses = self.forward(imgs, bboxes, labels, scale)
        all_losses.total_loss.backward()                     # backprop through the summed loss
        self.optimizer.step()                                # apply one parameter update
        self.update_meters(all_losses)                       # feed the running-average meters
        return all_losses

4.save和load部分。

    def save(self, save_optimizer=False, save_path=None, **kwargs):
        """serialize models include optimizer and other info
        return path where the model-file is stored.

        Args:
            save_optimizer (bool): whether save optimizer.state_dict().
            save_path (string): where to save model, if it's None, save_path
                is generate using time str and info from kwargs.

        Returns:
            save_path(str): the path to save models.
        """
        save_dict = dict()

        save_dict['model'] = self.faster_rcnn.state_dict()
        save_dict['config'] = opt._state_dict()
        save_dict['other_info'] = kwargs
        save_dict['vis_info'] = self.vis.state_dict()

        if save_optimizer:  # optimizer state is optional (large, only needed to resume)
            save_dict['optimizer'] = self.optimizer.state_dict()

        if save_path is None:
            # auto-generate a path from a timestamp plus any extra kwargs values
            timestr = time.strftime('%m%d%H%M')
            save_path = 'checkpoints/fasterrcnn_%s' % timestr
            for k_, v_ in kwargs.items():
                save_path += '_%s' % v_

        # BUG FIX: os.path.dirname() returns '' for a bare filename and
        # os.makedirs('') raises FileNotFoundError; guard with `if save_dir`.
        # exist_ok=True also removes the check-then-create race of the
        # original exists()/makedirs() pair.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        t.save(save_dict, save_path)
        self.vis.save([self.vis.env])
        return save_path

    def load(self, path, load_optimizer=True, parse_opt=False, ):
        """Restore model (and optionally optimizer/config) from a checkpoint.

        Supports both the dict format written by save() and, as a legacy
        fallback, a raw model state_dict. Returns self for chaining.
        """
        state_dict = t.load(path)
        if 'model' not in state_dict:
            # legacy way, for backward compatibility: the file is a bare
            # model state_dict with no config/optimizer alongside it
            self.faster_rcnn.load_state_dict(state_dict)
            return self
        self.faster_rcnn.load_state_dict(state_dict['model'])
        if parse_opt:
            opt._parse(state_dict['config'])
        if load_optimizer and 'optimizer' in state_dict:
            self.optimizer.load_state_dict(state_dict['optimizer'])
        return self

5.update_meters,reset_meters以及get_meter_data函数。

    def update_meters(self, losses):
        """Push the latest loss values into the running-average meters."""
        scalar_losses = {name: at.scalar(v) for name, v in losses._asdict().items()}
        for name, meter in self.meters.items():
            meter.add(scalar_losses[name])

    def reset_meters(self):
        """Clear every running loss average and both confusion matrices."""
        for meter in self.meters.values():
            meter.reset()
        self.rpn_cm.reset()
        self.roi_cm.reset()

    def get_meter_data(self):
        """Return the current mean of every tracked loss as a plain dict."""
        return {name: meter.value()[0] for name, meter in self.meters.items()}

6._smooth_l1_loss函数就是计算smooth_l1损失。

def _smooth_l1_loss(x, t, in_weight, sigma):
    """Summed smooth-L1 (Huber) loss between prediction x and target t.

    Entries whose in_weight is 0 contribute nothing — this is how background
    samples are excluded from the localization loss (and why the caller
    needs the ground-truth labels to build in_weight).
    """
    sigma_sq = sigma ** 2
    weighted_diff = in_weight * (x - t)
    abs_diff = weighted_diff.abs()
    # 1.0 in the quadratic region |diff| < 1/sigma^2, 0.0 in the linear region.
    quad_mask = (abs_diff.data < (1. / sigma_sq)).float()
    loss = (quad_mask * (sigma_sq / 2.) * (weighted_diff ** 2)
            + (1 - quad_mask) * (abs_diff - 0.5 / sigma_sq))
    return loss.sum()

7._fast_rcnn_loc_loss(pred_loc,gt_loc,gt_label,sigma)函数用于计算位置损失。

def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
    """Smooth-L1 localization loss over positive samples only.

    Localization loss is calculated only for positive (non-background)
    anchors/RoIs. NOTE: unlike the origin implementation we don't need
    inside_weight and outside_weight — they can be derived from gt_label.
    """
    # Weight mask: 1 for every coordinate of a positive sample, 0 elsewhere.
    in_weight = t.zeros(gt_loc.shape).cuda()
    positive = (gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()
    in_weight[positive] = 1
    loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
    # Normalize by total number of negative and positive rois (label >= 0);
    # ignored samples (label == -1) are excluded from the count.
    loc_loss /= ((gt_label >= 0).sum().float())
    return loc_loss

二.再来看train.py(训练入口脚本,其中的eval和train函数调用了上面trainer.py中的功能)。
1.eval函数是用来评估预测结果好坏的函数。

from __future__ import  absolute_import
import os

import ipdb
import matplotlib
from tqdm import tqdm

from utils.config import opt
from data.dataset import Dataset, TestDataset, inverse_normalize
from model import FasterRCNNVGG16
from torch.utils import data as data_
from trainer import FasterRCNNTrainer
from utils import array_tool as at
from utils.vis_tool import visdom_bbox
from utils.eval_tool import eval_detection_voc

# fix for ulimit
# https://github.com/pytorch/pytorch/issues/973#issuecomment-346405667
import resource

# Raise the soft open-file limit so many DataLoader workers don't exhaust it.
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (20480, rlimit[1]))

# Headless backend: figures are rendered off-screen, no display required.
matplotlib.use('agg')

def eval(dataloader, faster_rcnn, test_num=10000):
    """Evaluate the detector on (at most test_num batches of) a dataset.

    Args:
        dataloader: iterable yielding (imgs, sizes, gt_bboxes, gt_labels,
            gt_difficults) batches.
        faster_rcnn: model exposing a predict(imgs, sizes) method.
        test_num (int): stop after this many batches.

    Returns:
        dict of VOC-style metrics (per-class AP and 'map') from
        eval_detection_voc.
    """
    pred_bboxes, pred_labels, pred_scores = list(), list(), list()
    gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
    # BUG FIX: the original for-statement was missing its trailing colon
    # (SyntaxError).
    for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):
        sizes = [sizes[0][0].item(), sizes[1][0].item()]
        # predict boxes, labels and scores for this batch
        pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
        # accumulate ground truth and predictions across the whole set
        gt_bboxes += list(gt_bboxes_.numpy())
        gt_labels += list(gt_labels_.numpy())
        gt_difficults += list(gt_difficults_.numpy())
        pred_bboxes += pred_bboxes_
        pred_labels += pred_labels_
        pred_scores += pred_scores_
        if ii == test_num:  # cap the number of evaluated batches
            break

    result = eval_detection_voc(
        pred_bboxes, pred_labels, pred_scores,
        gt_bboxes, gt_labels, gt_difficults,
        use_07_metric=True)
    return result

2.train(**kwargs)函数是整个网络的训练部分,这部分一定要弄明白。

def train(**kwargs):
    """Full training loop: load data, build the model, train with periodic
    visdom plotting, evaluate each epoch, checkpoint the best mAP, and
    decay the learning rate once at epoch 9. Stops after epoch 13.

    Args:
        **kwargs: config overrides forwarded to opt._parse.
    """
    opt._parse(kwargs)  # override config defaults with the supplied kwargs

    dataset = Dataset(opt)  # training dataset built from the configured path
    print('load data')
    # BUG FIX: the original call used line-continuation backslashes followed
    # by inline comments, which is a SyntaxError; parenthesized arguments
    # need no backslashes at all.
    dataloader = data_.DataLoader(dataset,
                                  batch_size=1,      # the whole pipeline supports batch=1 only
                                  shuffle=True,
                                  # pin_memory=True,
                                  num_workers=opt.num_workers)
    testset = TestDataset(opt)
    test_dataloader = data_.DataLoader(testset,
                                       batch_size=1,
                                       num_workers=opt.test_num_workers,
                                       shuffle=False,
                                       pin_memory=True)
    faster_rcnn = FasterRCNNVGG16()  # VGG16-backbone Faster R-CNN
    print('model construct completed')
    trainer = FasterRCNNTrainer(faster_rcnn).cuda()
    if opt.load_path:  # optionally resume from a pretrained checkpoint
        trainer.load(opt.load_path)
        print('load pretrained model from %s' % opt.load_path)
    trainer.vis.text(dataset.db.label_names, win='labels')  # show class names in visdom
    best_map = 0
    best_path = None  # set once the first checkpoint is saved (guards epoch-9 reload)
    lr_ = opt.lr
    for epoch in range(opt.epoch):
        trainer.reset_meters()  # fresh loss averages for this epoch
        for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
            scale = at.scalar(scale)
            img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
            trainer.train_step(img, bbox, label, scale)  # one optimization step

            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()  # drop into the debugger if the flag file exists

                # plot loss
                trainer.vis.plot_many(trainer.get_meter_data())

                # plot ground truth bboxes
                ori_img_ = inverse_normalize(at.tonumpy(img[0]))
                gt_img = visdom_bbox(ori_img_,
                                     at.tonumpy(bbox_[0]),
                                     at.tonumpy(label_[0]))
                trainer.vis.img('gt_img', gt_img)

                # plot predicted bboxes
                _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
                pred_img = visdom_bbox(ori_img_,
                                       at.tonumpy(_bboxes[0]),
                                       at.tonumpy(_labels[0]).reshape(-1),
                                       at.tonumpy(_scores[0]))
                trainer.vis.img('pred_img', pred_img)

                # rpn confusion matrix (meter)
                trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
                # roi confusion matrix, rendered as an image
                trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())

        # evaluation phase for this epoch
        eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
        trainer.vis.plot('test_map', eval_result['map'])
        lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
        log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
                                                  str(eval_result['map']),
                                                  str(trainer.get_meter_data()))
        trainer.vis.log(log_info)

        if eval_result['map'] > best_map:  # checkpoint whenever mAP improves
            best_map = eval_result['map']
            best_path = trainer.save(best_map=best_map)
        # BUG FIX: guard against best_path being unbound if mAP never improved.
        if epoch == 9 and best_path is not None:
            # reload the best checkpoint, then decay the learning rate by lr_decay
            trainer.load(best_path)
            trainer.faster_rcnn.scale_lr(opt.lr_decay)
            lr_ = lr_ * opt.lr_decay

        if epoch == 13:  # end of the training/validation schedule
            break

至此,对Faster RCNN的理解结束了,作为一个刚开始研究目标检测的学生,在结合别人对Faster RCNN的理解后终于自己基本看明白了代码,希望我也希望大家科研道路一切顺利~~

  • 2
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值