本文为个人学习过程中所记录笔记,便于梳理思路和后续查看用,如有错误,感谢批评指正!
paper:MASK RCNN
pytorch版本code参考:pytorch-mask-rcnn
参考
mask rcnn的整体框架如图所示:
roi align
在roi pooling当中,下采样以后,以及在划分格子进行pooling的过程中,均会出现小数点,这里都做了近似处理,会损失一定的精度。因此,roi align算法采用双线性插值,提取非整数值位置处的特征值。示意图如图所示:参考Faster-RCNN详解和torchvision源码解读(七):roi align
代码参考pytorch-mask-rcnn进行学习
backbone
这里选择resnet提取特征,返回C1, C2, C3, C4, C5五种不同尺度特征。
neck
采用FPN,返回p2_out, p3_out, p4_out, p5_out, p6_out多种融合的特征
```python
class FPN(nn.Module):
    """Feature Pyramid Network (FPN) neck.

    Wraps the five ResNet stages C1..C5 and merges their outputs into
    pyramid levels P2..P6, each with `out_channels` channels.  P6 only
    feeds the RPN and is a stride-2 subsample of P5.

    Args:
        C1..C5: the backbone stage modules (taken from the ResNet instance).
        out_channels: channel count of every output pyramid level.
    """

    def __init__(self, C1, C2, C3, C4, C5, out_channels):
        super(FPN, self).__init__()
        self.out_channels = out_channels
        # Backbone stages; calling them in sequence reproduces the ResNet
        # forward pass while exposing the intermediate feature maps.
        self.C1 = C1
        self.C2 = C2
        self.C3 = C3
        self.C4 = C4
        self.C5 = C5
        # P6: stride-2 subsample of P5 (kernel_size=1 keeps values as-is).
        self.P6 = nn.MaxPool2d(kernel_size=1, stride=2)
        # 1x1 lateral convs reduce each C-level to out_channels;
        # 3x3 "same" convs smooth each merged map.
        self.P5_conv1 = nn.Conv2d(2048, self.out_channels, kernel_size=1, stride=1)
        self.P5_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P4_conv1 = nn.Conv2d(1024, self.out_channels, kernel_size=1, stride=1)
        self.P4_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P3_conv1 = nn.Conv2d(512, self.out_channels, kernel_size=1, stride=1)
        self.P3_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P2_conv1 = nn.Conv2d(256, self.out_channels, kernel_size=1, stride=1)
        self.P2_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )

    def forward(self, x):
        """Return [p2_out, p3_out, p4_out, p5_out, p6_out] (channels-first)."""
        x = self.C1(x)
        x = self.C2(x)
        c2_out = x  # keep lateral inputs for the top-down pathway
        x = self.C3(x)
        c3_out = x
        x = self.C4(x)
        c4_out = x
        x = self.C5(x)
        # Top-down pathway: 1x1 lateral conv, then add the 2x-upsampled
        # coarser level.  F.interpolate replaces the deprecated F.upsample;
        # mode='nearest' matches F.upsample's default behavior exactly.
        p5_out = self.P5_conv1(x)  # 2048 -> out_channels
        p4_out = self.P4_conv1(c4_out) + F.interpolate(p5_out, scale_factor=2, mode='nearest')
        p3_out = self.P3_conv1(c3_out) + F.interpolate(p4_out, scale_factor=2, mode='nearest')
        p2_out = self.P2_conv1(c2_out) + F.interpolate(p3_out, scale_factor=2, mode='nearest')
        # 3x3 convs on each merged map reduce the aliasing from upsampling.
        p5_out = self.P5_conv2(p5_out)
        p4_out = self.P4_conv2(p4_out)
        p3_out = self.P3_conv2(p3_out)
        p2_out = self.P2_conv2(p2_out)
        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        p6_out = self.P6(p5_out)
        return [p2_out, p3_out, p4_out, p5_out, p6_out]
```

RPN
针对p2_out, p3_out, p4_out, p5_out, p6_out多个特征,分别用于生成候选框,每个特征返回一组分类softmax前的特征rpn_class_logits,anchor分数rpn_probs以及anchor回归box框rpn_bbox。
class RPN(nn.Module):
    """Region Proposal Network head, applied to one pyramid level at a time.

    Args:
        anchors_per_location: number of anchors per pixel in the feature map.
        anchor_stride: density of anchors — typically 1 (anchors for every
            feature-map pixel) or 2 (every other pixel).
        depth: channel count of the incoming feature map.

    Returns (from forward):
        rpn_class_logits: [batch, anchors, 2] classifier logits (pre-softmax).
        rpn_probs: [batch, anchors, 2] BG/FG probabilities.
        rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))] deltas to be
            applied to the anchors.
    """

    def __init__(self, anchors_per_location, anchor_stride, depth):
        super(RPN, self).__init__()
        self.anchors_per_location = anchors_per_location
        self.anchor_stride = anchor_stride
        self.depth = depth
        # 3x3 "same" conv shared by the classification and regression branches.
        self.padding = SamePad2d(kernel_size=3, stride=self.anchor_stride)
        self.conv_shared = nn.Conv2d(self.depth, 512, kernel_size=3, stride=self.anchor_stride)
        self.relu = nn.ReLU(inplace=True)
        # 1x1 heads: 2 scores (BG/FG) and 4 box deltas per anchor.
        self.conv_class = nn.Conv2d(512, 2 * anchors_per_location, kernel_size=1, stride=1)
        self.softmax = nn.Softmax(dim=2)
        self.conv_bbox = nn.Conv2d(512, 4 * anchors_per_location, kernel_size=1, stride=1)

    def forward(self, x):
        # Shared convolutional base of the RPN.
        shared = self.relu(self.conv_shared(self.padding(x)))
        batch = shared.size()[0]
        # Classification branch: [batch, 2*A, H, W] -> [batch, H*W*A, 2].
        rpn_class_logits = (self.conv_class(shared)
                            .permute(0, 2, 3, 1)
                            .contiguous()
                            .view(batch, -1, 2))
        # Softmax over the BG/FG dimension.
        rpn_probs = self.softmax(rpn_class_logits)
        # Regression branch: [batch, 4*A, H, W] -> [batch, H*W*A, 4].
        rpn_bbox = (self.conv_bbox(shared)
                    .permute(0, 2, 3, 1)
                    .contiguous()
                    .view(batch, -1, 4))
        return [rpn_class_logits, rpn_probs, rpn_bbox]
分类
分类网络的输入为fpn的输出[p2_out, p3_out, p4_out, p5_out],以及rpn的输出(各个特征cat)包括anchor分数和anchor box。就是一个常见的分类网络就不细说了。返回值为全连接输出结果类别(维度为num_classes),全连接归一化输出结果(维度为num_classes),全连接回归输出结果回归框(维度为num_classes*4)。
class Classifier(nn.Module):
    """Box classification / regression head of Mask R-CNN.

    Pools each ROI from the appropriate pyramid level, then applies two
    1024-channel conv layers (the first with kernel_size == pool_size,
    which acts like a fully connected layer over the pooled ROI).

    Returns (from forward):
        mrcnn_class_logits: [num_rois, num_classes] raw class scores.
        mrcnn_probs: [num_rois, num_classes] softmax class probabilities.
        mrcnn_bbox: [num_rois, num_classes, 4] per-class box refinements.
    """

    def __init__(self, depth, pool_size, image_shape, num_classes):
        super(Classifier, self).__init__()
        self.depth = depth
        self.pool_size = pool_size
        self.image_shape = image_shape
        self.num_classes = num_classes
        # kernel_size == pool_size collapses the ROI to 1x1 spatially.
        self.conv1 = nn.Conv2d(self.depth, 1024, kernel_size=self.pool_size, stride=1)
        self.bn1 = nn.BatchNorm2d(1024, eps=0.001, momentum=0.01)
        self.conv2 = nn.Conv2d(1024, 1024, kernel_size=1, stride=1)
        self.bn2 = nn.BatchNorm2d(1024, eps=0.001, momentum=0.01)
        self.relu = nn.ReLU(inplace=True)
        self.linear_class = nn.Linear(1024, num_classes)
        self.softmax = nn.Softmax(dim=1)
        self.linear_bbox = nn.Linear(1024, num_classes * 4)

    def forward(self, x, rois):
        # ROI Align produces fixed-size features, required by the FC layers.
        pooled = pyramid_roi_align([rois] + x, self.pool_size, self.image_shape)
        out = self.relu(self.bn1(self.conv1(pooled)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = out.view(-1, 1024)
        mrcnn_class_logits = self.linear_class(out)
        mrcnn_probs = self.softmax(mrcnn_class_logits)
        mrcnn_bbox = self.linear_bbox(out)
        mrcnn_bbox = mrcnn_bbox.view(mrcnn_bbox.size()[0], -1, 4)
        return [mrcnn_class_logits, mrcnn_probs, mrcnn_bbox]
MASK分支
输入为fpn的输出p2_out, p3_out, p4_out, p5_out以及检测框,由此可见,maskrcnn的分割是基于检测做的,经过roi align以后多次卷积便可以。
class Mask(nn.Module):
    """Mask head of Mask R-CNN.

    Pools each ROI, runs four 3x3 conv-BN-ReLU stages, upsamples 2x with a
    transposed conv, and predicts one sigmoid mask per class.

    Returns (from forward):
        [num_rois, num_classes, 2*pool_size, 2*pool_size] mask probabilities.
    """

    def __init__(self, depth, pool_size, image_shape, num_classes):
        super(Mask, self).__init__()
        self.depth = depth
        self.pool_size = pool_size
        self.image_shape = image_shape
        self.num_classes = num_classes
        self.padding = SamePad2d(kernel_size=3, stride=1)
        self.conv1 = nn.Conv2d(self.depth, 256, kernel_size=3, stride=1)
        self.bn1 = nn.BatchNorm2d(256, eps=0.001)
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=1)
        self.bn2 = nn.BatchNorm2d(256, eps=0.001)
        self.conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(256, eps=0.001)
        self.conv4 = nn.Conv2d(256, 256, kernel_size=3, stride=1)
        self.bn4 = nn.BatchNorm2d(256, eps=0.001)
        # 2x upsampling before the final per-class prediction.
        self.deconv = nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2)
        self.conv5 = nn.Conv2d(256, num_classes, kernel_size=1, stride=1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x, rois):
        out = pyramid_roi_align([rois] + x, self.pool_size, self.image_shape)
        # Four identical "same"-padded conv-BN-ReLU stages.
        stages = ((self.conv1, self.bn1), (self.conv2, self.bn2),
                  (self.conv3, self.bn3), (self.conv4, self.bn4))
        for conv, bn in stages:
            out = self.relu(bn(conv(self.padding(out))))
        out = self.relu(self.deconv(out))
        return self.sigmoid(self.conv5(out))
pyramid_roi_align
def pyramid_roi_align(inputs, pool_size, image_shape):
    """Implements ROI Pooling on multiple levels of the feature pyramid.
    Params:
    - pool_size: [height, width] of the output pooled regions. Usually [7, 7]
    - image_shape: [height, width, channels]. Shape of input image in pixels
    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
             coordinates.
    - Feature maps: List of feature maps from different levels of the pyramid.
                    Each is [batch, channels, height, width]
    Output:
    Pooled regions in the shape: [num_boxes, height, width, channels].
    The width and height are those specific in the pool_shape in the layer
    constructor.
    """
    # Currently only supports batchsize 1
    # Drop the batch dimension from the boxes and every feature map.
    # NOTE(review): this mutates the caller's `inputs` list in place.
    for i in range(len(inputs)):
        inputs[i] = inputs[i].squeeze(0)
    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]
    # Feature Maps. List of feature maps from different level of the
    # feature pyramid. Each is [batch, height, width, channels]
    feature_maps = inputs[1:]
    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = boxes.chunk(4, dim=1)
    h = y2 - y1
    w = x2 - x1
    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # e.g. a 224x224 ROI (in pixels) maps to P4
    # `Variable` is the legacy pre-0.4 PyTorch autograd wrapper; kept here
    # to match the original codebase.
    image_area = Variable(torch.FloatTensor([float(image_shape[0]*image_shape[1])]), requires_grad=False)
    if boxes.is_cuda:
        image_area = image_area.cuda()
    # `log2` is presumably a project helper (torch has no elementwise log2
    # in the version this repo targets) — TODO confirm against utils.
    roi_level = 4 + log2(torch.sqrt(h*w)/(224.0/torch.sqrt(image_area)))
    roi_level = roi_level.round().int()
    # Clamp to the available pyramid levels P2..P5.
    roi_level = roi_level.clamp(2,5)
    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):
        # Boolean mask of the boxes assigned to this pyramid level.
        ix = roi_level==level
        if not ix.any():
            continue
        ix = torch.nonzero(ix)[:,0]
        level_boxes = boxes[ix.data, :]
        # Keep track of which box is mapped to which level
        box_to_level.append(ix.data)
        # Stop gradient propogation to ROI proposals
        level_boxes = level_boxes.detach()
        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        # `ind` maps each box to image 0 of the (batch-1) feature map.
        ind = Variable(torch.zeros(level_boxes.size()[0]),requires_grad=False).int()
        if level_boxes.is_cuda:
            ind = ind.cuda()
        # Re-add the batch dimension stripped by the squeeze(0) above
        # (CropAndResizeFunction expects a batched feature map).
        feature_maps[i] = feature_maps[i].unsqueeze(0) #CropAndResizeFunction needs batch dimension
        pooled_features = CropAndResizeFunction(pool_size, pool_size, 0)(feature_maps[i], level_boxes, ind)
        pooled.append(pooled_features)
    # Pack pooled features into one tensor
    pooled = torch.cat(pooled, dim=0)
    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = torch.cat(box_to_level, dim=0)
    # Rearrange pooled features to match the order of the original boxes:
    # sorting the original box indices yields the permutation that undoes
    # the per-level grouping.
    _, box_to_level = torch.sort(box_to_level)
    pooled = pooled[box_to_level, :, :]
    return pooled