Faster-RCNN base & Faster-RCNN top (除去RPN部分) :
Github源码如下(vgg16.py):
# --------------------------------------------------------
# Tensorflow Faster R-CNN
# Licensed under The MIT License [see LICENSE for details]
# Written by Xinlei Chen
# --------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
import torchvision.models as models
from model.faster_rcnn.faster_rcnn import _fasterRCNN
import pdb
class vgg16(_fasterRCNN):
    """Faster R-CNN model whose feature extractor and RoI head come from VGG-16.

    ``_init_modules`` populates four attributes:

    * ``RCNN_base``      -- the VGG-16 ``features`` stack minus its last module.
    * ``RCNN_top``       -- the VGG-16 ``classifier`` stack minus its last module.
    * ``RCNN_cls_score`` -- linear layer producing ``n_classes`` outputs.
    * ``RCNN_bbox_pred`` -- linear layer producing 4 outputs (class agnostic)
      or ``4 * n_classes`` outputs.

    NOTE(review): ``self.n_classes`` is read here but never assigned in this
    class; presumably ``_fasterRCNN.__init__`` sets it -- confirm there.
    """

    def __init__(self, classes, pretrained=False, class_agnostic=False):
        """Store the backbone settings and delegate to the base constructor.

        Args:
            classes: forwarded unchanged to ``_fasterRCNN.__init__``.
            pretrained: when true, ``_init_modules`` loads the weights found
                at ``self.model_path`` into the VGG-16 network.
            class_agnostic: when true, the box regressor has 4 outputs in
                total instead of 4 per class.
        """
        # All four attributes are assigned before the base constructor runs.
        self.pretrained = pretrained
        self.class_agnostic = class_agnostic
        self.dout_base_model = 512
        self.model_path = 'data/pretrained_model/vgg16_caffe.pth'

        _fasterRCNN.__init__(self, classes, class_agnostic)

    def _init_modules(self):
        """Create ``RCNN_base``, ``RCNN_top`` and the two output layers."""
        backbone = models.vgg16()

        if self.pretrained:
            print("Loading pretrained weights from %s" % self.model_path)
            checkpoint = torch.load(self.model_path)
            # Only entries whose key also exists in the freshly built network
            # are handed to load_state_dict; anything else is ignored.
            expected = backbone.state_dict()
            matching = {}
            for key, tensor in checkpoint.items():
                if key in expected:
                    matching[key] = tensor
            backbone.load_state_dict(matching)

        # Fully connected part without its final layer; the two task-specific
        # linear layers created below replace it.
        head = nn.Sequential(*list(backbone.classifier.children())[:-1])
        backbone.classifier = head

        # Convolutional part without its final module (the last max-pool).
        conv_layers = list(backbone.features.children())[:-1]
        self.RCNN_base = nn.Sequential(*conv_layers)

        # The first 10 modules (the layers before conv3) are excluded from
        # gradient computation.
        for frozen in conv_layers[:10]:
            for weight in frozen.parameters():
                weight.requires_grad = False

        self.RCNN_top = head

        self.RCNN_cls_score = nn.Linear(4096, self.n_classes)
        bbox_outputs = 4 if self.class_agnostic else 4 * self.n_classes
        self.RCNN_bbox_pred = nn.Linear(4096, bbox_outputs)

    def _head_to_tail(self, pool5):
        """Flatten ``pool5`` per sample and pass it through ``RCNN_top``.

        Args:
            pool5: tensor whose first dimension is preserved and whose
                remaining dimensions are collapsed into one (presumably the
                pooled RoI features -- confirm against the caller).

        Returns:
            The output of ``self.RCNN_top`` for the flattened input.
        """
        batch = pool5.size(0)
        flattened = pool5.view(batch, -1)
        return self.RCNN_top(flattened)
上面的代码最好结合着下面的Faster RCNN结构图来看:
首先构建torchvision中的vgg16模型。本类的参数pretrained默认为False,但是trainval_net.py文件初始化vgg16时传入的是pretrained=True:
之后加载预训练模型。我们可以看到源码中有这样几个重要的部分:RCNN_base、RCNN_top、RCNN_cls_score、RCNN_bbox_pred。这四个部分分别指的是Faster RCNN结构中的哪一部分呢?下面通过源码分析:
1. RCNN_base:
self.RCNN_base = nn.Sequential(*list(vgg.features._modules.values())[:-1])
实际为vgg16中从conv1_1到conv5_3(含其后的ReLU激活层)之间所有的层,包括中间的激活层和前四个最大池化层,但不包括conv5_3之后的最后一个最大池化层(即切片[:-1]去掉的那一层)。
2. RCNN_top:
vgg.classifier = nn.Sequential(*list(vgg.classifier._modules.values())[:-1])
self.RCNN_top = vgg.classifier
实际为vgg16的全连接部分,但是不包含FC-1000,因为后面还要加两路全连接层:一路用来分类,即得到候选框属于每个类别的概率;另一路用来回归,得到bbox。其中分类的全连接层为RCNN_cls_score,回归的全连接层为RCNN_bbox_pred。
self.RCNN_cls_score = nn.Linear(4096, self.n_classes)
if self.class_agnostic:
self.RCNN_bbox_pred = nn.Linear(4096, 4)
else:
self.RCNN_bbox_pred = nn.Linear(4096, 4 * self.n_classes)
在faster_rcnn.py文件中,源码将RCNN_cls_score的输出送入softmax层,得到每个类别的预测概率:
cls_score = self.RCNN_cls_score(pooled_feat)
cls_prob = F.softmax(cls_score, 1)