Mask R-CNN Explained (Annotated Walkthrough)

I. Before studying Mask R-CNN, it is worth reviewing Faster R-CNN first (see the Faster R-CNN code walkthrough).

Key techniques in Mask R-CNN:

1. Multi-scale detection, built on the FPN (Feature Pyramid Network) technique (YOLOv3 later adopted a similar multi-scale scheme)

2. RPN (Region Proposal Network)

3. RoI Align

II. For a systematic walkthrough of Mask R-CNN, see the accompanying Bilibili video series.

III. Annotated source code

model.py

"""
Mask R-CNN
The main Mask R-CNN model implementation.
Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
"""

import datetime
import math
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

import utils
import visualize
#from nms.nms_wrapper import nms
from roialign.roi_align.crop_and_resize import CropAndResizeFunction

############################################################
# nms
############################################################
# Example input for the standalone nms() below (columns: x1, y1, x2, y2, score):
# boxes = np.array([[100, 100, 210, 210, 0.72],
#                   [250, 250, 420, 420, 0.80],
#                   [220, 220, 320, 330, 0.92],
#                   [100, 100, 210, 210, 0.72],
#                   [230, 240, 325, 330, 0.81],
#                   [220, 230, 315, 340, 0.90]])
def nms(dets, thresh):
    # dets: (m, 5) array of boxes (x1, y1, x2, y2, score); thresh: scalar IoU threshold
    x1 = dets[:,0]
    y1 = dets[:,1]
    x2 = dets[:,2]
    y2 = dets[:,3]
    areas = (y2-y1+1) * (x2-x1+1)
    scores = dets[:,4]
    keep = []
    index = scores.argsort()[::-1]
    while index.size > 0:
        i = index[0]       # the first remaining index always has the highest score, so keep it
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]])    # corners of the intersection rectangles
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22-x11+1)    # width of the intersection
        h = np.maximum(0, y22-y11+1)    # height of the intersection
        overlaps = w*h
        ious = overlaps / (areas[i]+areas[index[1:]] - overlaps)
        idx = np.where(ious<=thresh)[0]
        index = index[idx+1]   # offset by 1 because ious was computed against index[1:]
    return keep
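# A quick, illustrative sanity check for nms() using the example boxes above
# (uncomment to run; the surviving indices depend on the chosen thresh):
# boxes = np.array([[100, 100, 210, 210, 0.72],
#                   [250, 250, 420, 420, 0.80],
#                   [220, 220, 320, 330, 0.92],
#                   [100, 100, 210, 210, 0.72],
#                   [230, 240, 325, 330, 0.81],
#                   [220, 230, 315, 340, 0.90]])
# keep = nms(boxes, thresh=0.7)
# print(keep)  # indices of the boxes that survive suppression, highest scores first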
# import matplotlib.pyplot as plt
# def plot_bbox(dets, c='k'):
    
    # x1 = dets[:,0]
    # y1 = dets[:,1]
    # x2 = dets[:,2]
    # y2 = dets[:,3]
    
    # plt.plot([x1,x2], [y1,y1], c)
    # plt.plot([x1,x1], [y1,y2], c)
    # plt.plot([x1,x2], [y2,y2], c)
    # plt.plot([x2,x2], [y1,y2], c)
    # plt.title("after nms")  

############################################################
#  Logging Utility Functions
############################################################

def log(text, array=None):
    """Prints a text message. And, optionally, if a Numpy array is provided it
    prints its shape, min, and max values.
    """
    if array is not None:
        text = text.ljust(25)  # left-justify, padding with spaces to width 25 (returned unchanged if already longer)
        text += ("shape: {:20}  min: {:10.5f}  max: {:10.5f}".format(
            str(array.shape),                    # turn the shape tuple into a string, e.g. (m, n) -> '(m, n)'
            array.min() if array.size else "",   # array.size is the element count m*n; min()/max() over the whole array
            array.max() if array.size else ""))
    print(text)

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')  # '\r' redraws the bar on the same line
    # Print New Line on Complete
    if iteration == total:
        print()
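# A minimal usage sketch for printProgressBar (illustrative values):
# total = 20
# for step in range(1, total + 1):
#     # ... one unit of work per iteration ...
#     printProgressBar(step, total, prefix='Progress:', suffix='Complete', length=40)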


############################################################
#  Pytorch Utility Functions
############################################################

def unique1d(tensor):
    if tensor.size()[0] == 0 or tensor.size()[0] == 1:
        return tensor
    tensor = tensor.sort()[0]  # sort the 1-D tensor so duplicates become adjacent
    unique_bool = tensor[1:] != tensor[:-1]  # True where an element differs from its predecessor
    first_element = Variable(torch.ByteTensor([True]), requires_grad=False).bool()
    if tensor.is_cuda:
        first_element = first_element.cuda()
    unique_bool = torch.cat((first_element, unique_bool), dim=0)  # the first element is always kept
    return tensor[unique_bool.data]

# Intersection of two 1-D tensors
def intersect1d(tensor1, tensor2):
    assert len(tensor1.shape) == 1 and len(tensor2.shape) == 1 and len(tensor1) > 1 and len(tensor2) > 1, \
        "inputs must be 1-D with more than one element"
    aux = torch.cat((tensor1, tensor2), dim=0)
    aux = aux.sort()[0]  # after sorting, values shared by both tensors sit next to each other
    return aux[:-1][(aux[1:] == aux[:-1]).data]
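# Quick, illustrative checks for unique1d / intersect1d (hypothetical inputs;
# intersect1d assumes each input contains no internal duplicates):
# a = torch.IntTensor([3, 1, 2, 3, 1])
# unique1d(a)              # -> tensor of [1, 2, 3], sorted, duplicates removed
# t1 = torch.IntTensor([1, 2, 3])
# t2 = torch.IntTensor([2, 3, 5])
# intersect1d(t1, t2)      # -> tensor of [2, 3]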

def log2(x):
    """Implementation of log2. PyTorch doesn't have a native implementation."""
    ln2 = Variable(torch.log(torch.FloatTensor([2.0])), requires_grad=False)
    if x.is_cuda:
        ln2 = ln2.cuda()
    return torch.log(x) / ln2

class SamePad2d(nn.Module):
    """Mimics tensorflow's 'SAME' padding.
    """

    def __init__(self, kernel_size, stride):
        super(SamePad2d, self).__init__()
        self.kernel_size = torch.nn.modules.utils._pair(kernel_size)  # _pair() turns a scalar into a (k, k) tuple
        self.stride = torch.nn.modules.utils._pair(stride)

    def forward(self, input):
        # input: (batch, c, h, w)
        in_width = input.size()[3]
        in_height = input.size()[2]
        out_width = math.ceil(float(in_width) / float(self.stride[0]))   # round up
        out_height = math.ceil(float(in_height) / float(self.stride[1]))
        pad_along_width = ((out_width - 1) * self.stride[0] +
                           self.kernel_size[0] - in_width)
        pad_along_height = ((out_height - 1) * self.stride[1] +
                            self.kernel_size[1] - in_height)  # invert out = (in + pad - k + s) / s to solve for pad
        pad_left = math.floor(pad_along_width / 2)   # round down; any odd remainder goes to the right/bottom
        pad_top = math.floor(pad_along_height / 2)
        pad_right = pad_along_width - pad_left
        pad_bottom = pad_along_height - pad_top
        return F.pad(input, (pad_left, pad_right, pad_top, pad_bottom), 'constant', 0)

    def __repr__(self):
        return self.__class__.__name__
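# A small shape check for SamePad2d (illustrative): with 'SAME' padding,
# a stride-1 3x3 convolution preserves the spatial size.
# pad = SamePad2d(kernel_size=3, stride=1)
# x = Variable(torch.randn(1, 8, 32, 32))
# pad(x).size()   # -> (1, 8, 34, 34); a following 3x3 stride-1 conv returns 32x32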


############################################################
#  FPN Graph
############################################################


class TopDownLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(TopDownLayer, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)  # 1x1 conv to compress channels
        self.padding2 = SamePad2d(kernel_size=3, stride=1)   # zero-pads the feature map ('SAME' padding)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1)

    def forward(self, x, y):
        y = F.upsample(y, scale_factor=2)  # upsample y to twice its height and width
        x = self.conv1(x)                  # adjust the channel count of x
        return self.conv2(self.padding2(x + y))  # fuse x and y, pad, then convolve so the spatial size is unchanged
          
class FPN(nn.Module):
    def __init__(self, C1, C2, C3, C4, C5, out_channels):
        super(FPN, self).__init__()
        self.out_channels = out_channels  # every pyramid level outputs the same number of channels
        self.C1 = C1    # backbone stages (convolution blocks)
        self.C2 = C2
        self.C3 = C3
        self.C4 = C4
        self.C5 = C5
        self.P6 = nn.MaxPool2d(kernel_size=1, stride=2)

        self.P5_conv1 = nn.Conv2d(2048, self.out_channels, kernel_size=1, stride=1)
        self.P5_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),  # pad first so the 3x3 conv below preserves the spatial size
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),  # 3x3 conv to reduce upsampling aliasing
        )
        self.P4_conv1 =  nn.Conv2d(1024, self.out_channels, kernel_size=1, stride=1)
        self.P4_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P3_conv1 = nn.Conv2d(512, self.out_channels, kernel_size=1, stride=1)
        self.P3_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P2_conv1 = nn.Conv2d(256, self.out_channels, kernel_size=1, stride=1)
        self.P2_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        c2_out = x
        x = self.C3(x)
        c3_out = x
        x = self.C4(x)
        c4_out = x
        x = self.C5(x)
        p5_out = self.P5_conv1(x)
        p4_out = self.P4_conv1(c4_out) + F.upsample(p5_out, scale_factor=2)
        p3_out = self.P3_conv1(c3_out) + F.upsample(p4_out, scale_factor=2)
        p2_out = self.P2_conv1(c2_out) + F.upsample(p3_out, scale_factor=2)

        p5_out = self.P5_conv2(p5_out)
        p4_out = self.P4_conv2(p4_out)
        p3_out = self.P3_conv2(p3_out)
        p2_out = self.P2_conv2(p2_out)

        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        p6_out = self.P6(p5_out)

        return [p2_out, p3_out, p4_out, p5_out, p6_out]
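# Illustrative output shapes for a hypothetical 1024x1024 input image with
# out_channels=256 (strides 4, 8, 16, 32, 64 relative to the input):
# P2: (1, 256, 256, 256)
# P3: (1, 256, 128, 128)
# P4: (1, 256,  64,  64)
# P5: (1, 256,  32,  32)
# P6: (1, 256,  16,  16)   # used only by the RPN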


############################################################
#  Resnet Graph
############################################################
class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride)  
        self.bn1 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01) 
        self.padding2 = SamePad2d(kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1)   
        self.bn3 = nn.BatchNorm2d(planes * 4, eps=0.001, momentum=0.01) 
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        # assume x is (n, inplanes, h, w)
        out = self.conv1(x)  # compress channels   (n, planes, h, w)
        out = self.bn1(out)  # normalize           (n, planes, h, w)
        out = self.relu(out) # non-linearity       (n, planes, h, w)

        out = self.padding2(out)  # add padding    (n, planes, h+pad, w+pad)
        out = self.conv2(out)     # 3x3 conv back to (n, planes, h, w)
        out = self.bn2(out)       # (n, planes, h, w)
        out = self.relu(out)      # (n, planes, h, w)

        out = self.conv3(out)     # (n, 4*planes, h, w)
        out = self.bn3(out)       # (n, 4*planes, h, w)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
        
class ResNet(nn.Module):

    def __init__(self, architecture, stage5=False):
        super(ResNet, self).__init__()
        assert architecture in ["resnet50", "resnet101"]
        self.inplanes = 64
        self.layers = [3, 4, {"resnet50": 6, "resnet101": 23}[architecture], 3]
        self.block = Bottleneck
        self.stage5 = stage5

        self.C1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64, eps=0.001, momentum=0.01),
            nn.ReLU(inplace=True),
            SamePad2d(kernel_size=3, stride=2),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.C2 = self.make_layer(self.block, 64, self.layers[0])
        self.C3 = self.make_layer(self.block, 128, self.layers[1], stride=2)
        self.C4 = self.make_layer(self.block, 256, self.layers[2], stride=2)
        if self.stage5:
            self.C5 = self.make_layer(self.block, 512, self.layers[3], stride=2)
        else:
            self.C5 = None

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        x = self.C3(x)
        x = self.C4(x)
        x = self.C5(x)  # assumes stage5=True; with stage5=False, C5 is None and this call would fail
        return x


    def stages(self):
        return [self.C1, self.C2, self.C3, self.C4, self.C5]

    def make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes * block.expansion, eps=0.001, momentum=0.01),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
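# How the ResNet stages are wired into the FPN above (a sketch; it assumes
# stage5=True so that C5 exists, matching FPN's 2048-channel P5_conv1):
# resnet = ResNet("resnet101", stage5=True)
# C1, C2, C3, C4, C5 = resnet.stages()
# fpn = FPN(C1, C2, C3, C4, C5, out_channels=256)
# p2_out, p3_out, p4_out, p5_out, p6_out = fpn(images)   # images: (1, 3, h, w)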


############################################################
#  Proposal Layer
############################################################

def apply_box_deltas(boxes, deltas):
    """Applies the given deltas to the given boxes.
    boxes: [N, 4] where each row is y1, x1, y2, x2
    deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)]
    """
    # Convert to y, x, h, w
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    center_y = boxes[:, 0] + 0.5 * height
    center_x = boxes[:, 1] + 0.5 * width
    # Apply deltas
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= torch.exp(deltas[:, 2])
    width *= torch.exp(deltas[:, 3])
    # Convert back to y1, x1, y2, x2
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    y2 = y1 + height
    x2 = x1 + width
    result = torch.stack([y1, x1, y2, x2], dim=1)
    return result
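# A worked example for apply_box_deltas (hypothetical numbers): the box
# (y1, x1, y2, x2) = (0, 0, 10, 10) has center (5, 5) and h = w = 10; the
# deltas (0.1, 0.0, log(2), 0.0) shift the center to (6, 5) and double the
# height, giving (-4, 0, 16, 10):
# boxes  = torch.FloatTensor([[0, 0, 10, 10]])
# deltas = torch.FloatTensor([[0.1, 0.0, math.log(2), 0.0]])
# apply_box_deltas(boxes, deltas)   # -> [[-4., 0., 16., 10.]]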

def clip_boxes(boxes, window):
    """
    boxes: [N, 4] each col is y1, x1, y2, x2
    window: [4] in the form y1, x1, y2, x2
    """
    boxes = torch.stack( \
        [boxes[:, 0].clamp(float(window[0]), float(window[2])),
         boxes[:, 1].clamp(float(window[1]), float(window[3])),
         boxes[:, 2].clamp(float(window[0]), float(window[2])),
         boxes[:, 3].clamp(float(window[1]), float(window[3]))], 1)
    return boxes

def proposal_layer(inputs, proposal_count, nms_threshold, anchors, config=None):
    """Receives anchor scores and selects a subset to pass as proposals
    to the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding
    box refinement deltas to anchors.
    anchors: (anchors_num, 4), given in image-pixel coordinates
    Inputs:
        rpn_probs: [batch, anchors_num, (bg prob, fg prob)]
        rpn_bbox: [batch, anchors_num, (dy, dx, log(dh), log(dw))]
    Returns:
        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
    """

    # Currently only supports batchsize 1
    # inputs is a list: [rpn_probs, rpn_bbox]
    # inputs[0] is rpn_probs (batch, anchors_num, 2)
    # inputs[1] is rpn_bbox  (batch, anchors_num, 4)

    inputs[0] = inputs[0].squeeze(0)  # drop the batch dimension; each batch holds only one image
    inputs[1] = inputs[1].squeeze(0)

    # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
    scores = inputs[0][:, 1]

    # Box deltas [batch, num_rois, 4]
    deltas = inputs[1]  # the per-anchor refinements predicted by the first stage (RPN)
    # RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
    std_dev = Variable(torch.from_numpy(np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(), requires_grad=False)
    if config.GPU_COUNT:   # GPU_COUNT=1 means run on the GPU; 0 means run on the CPU
        std_dev = std_dev.cuda()
    deltas = deltas * std_dev

    # Improve performance by trimming to top anchors by score
    # and doing the rest on the smaller subset.
    pre_nms_limit = min(6000, anchors.size()[0])   # how many anchors to keep before NMS
    scores, order = scores.sort(descending=True)   # sort scores in descending order; returns the sorted
                                                   # values and their indices in the original sequence
    order = order[:pre_nms_limit]                  # keep the top pre_nms_limit indices
    scores = scores[:pre_nms_limit]                # keep the top pre_nms_limit scores, shape (pre_nms_limit,)
    deltas = deltas[order.data, :]  # TODO: Support batch size > 1 ff.  # deltas of the top pre_nms_limit anchors
    anchors = anchors[order.data, :]               # same for the anchors

    # Apply deltas to anchors to get refined anchors.
    # [batch, N, (y1, x1, y2, x2)]
    boxes = apply_box_deltas(anchors, deltas)      # refine the anchors with the predicted offsets; the result
                                                   # holds corner coordinates, shape (pre_nms_limit, (y1, x1, y2, x2))

    # Clip to image boundaries. [batch, N, (y1, x1, y2, x2)]
    height, width = config.IMAGE_SHAPE[:2]         # image boundary
    window = np.array([0, 0, height, width]).astype(np.float32)
    boxes = clip_boxes(boxes, window)              # constrain the boxes to lie inside the image

    # Filter out small boxes
    # According to Xinlei Chen's paper, this reduces detection accuracy
    # for small objects, so we're skipping it.

    # Non-max suppression
    # scores.unsqueeze(1) turns scores into (pre_nms_limit, 1), which is concatenated
    # with boxes (pre_nms_limit, 4) along dim=1 to form a (pre_nms_limit, 5) tensor
    keep = nms(torch.cat((boxes, scores.unsqueeze(1)), 1).data, nms_threshold)  # keep is a list of the box indices that survive NMS
    if len(keep) > proposal_count:
        keep = keep[:proposal_count]
    boxes = boxes[keep, :]

    # Normalize dimensions to range of 0 to 1.
    norm = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        norm = norm.cuda()
    normalized_boxes = boxes / norm  # normalize the boxes to [0, 1]

    # Add back batch dimension
    normalized_boxes = normalized_boxes.unsqueeze(0)  # (proposal_count, 4) -> (1, proposal_count, 4)

    return normalized_boxes
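# A call sketch for proposal_layer (illustrative; the config attribute names
# follow the Matterport config, and shapes assume batch size 1):
# rpn_rois = proposal_layer([rpn_probs, rpn_bbox],            # (1, anchors_num, 2), (1, anchors_num, 4)
#                           proposal_count=config.POST_NMS_ROIS_TRAINING,
#                           nms_threshold=config.RPN_NMS_THRESHOLD,
#                           anchors=anchors, config=config)
# rpn_rois: (1, proposal_count, 4), normalized (y1, x1, y2, x2)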


############################################################
#  ROIAlign Layer
############################################################

def pyramid_roi_align(inputs, pool_size, image_shape):
    """Implements ROI Pooling on multiple levels of the feature pyramid.
    Params:
    - pool_size: [height, width] of the output pooled regions. Usually [7, 7]
    - image_shape: [channels, height, width]. Shape of the input image in pixels
    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
             coordinates.
    - Feature maps: List of feature maps from different levels of the pyramid.
                    Each is [batch, channels, height, width]
    Output:
    Pooled regions in the shape: [num_boxes, channels, height, width].
    The width and height are those specified in pool_size.
    """

    # Currently only supports batchsize 1
    for i in range(len(inputs)):
        inputs[i] = inputs[i].squeeze(0)  # drop the batch dimension

    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]  # (num_boxes, 4)

    # Feature Maps. List of feature maps from different level of the
    # feature pyramid. Each is [batch, channels, height, width]
    feature_maps = inputs[1:]  # [p2, p3, p4, p5, p6]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = boxes.chunk(4, dim=1)  # split boxes into four (num_boxes, 1) columns
    h = y2 - y1
    w = x2 - x1

    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # a 224x224 ROI (in pixels) maps to P4
    image_area = Variable(torch.FloatTensor([float(image_shape[1]*image_shape[2])]), requires_grad=False)
    if boxes.is_cuda:
        image_area = image_area.cuda()
    roi_level = 4 + log2(torch.sqrt(h*w)/(224.0/torch.sqrt(image_area)))  # choose which pyramid level each ROI is pooled from
    roi_level = roi_level.round().int()   # round() rounds to the nearest integer
    roi_level = roi_level.clamp(2, 5)     # FPN produces [P2, P3, P4, P5, P6], but only [P2, P3, P4, P5] are used for ROI pooling


    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):  # i in [0, 1, 2, 3], level in [2, 3, 4, 5]
        ix = roi_level == level  # boolean mask of the ROIs assigned to this level
        if not ix.any():  # any() returns True if at least one element is True; skip levels with no ROIs
            continue
        assert len(ix.shape) > 1, "ix must have more than one dimension"
        ix = torch.nonzero(ix)[:,0]  # torch.nonzero returns the indices of the non-zero elements as an (m, d) matrix,
                                     # where m is the number of non-zero elements and d the number of dimensions.
                                     # e.g. ix=tensor([0,1,0,1])             -> tensor([[1],[3]])                 shape (2, 1)
                                     #      ix=tensor([[0,1,0,1],[0,1,0,1]]) -> tensor([[0,1],[0,3],[1,1],[1,3]]) shape (4, 2)
        # since ix has shape (n, 1) here, torch.nonzero(ix) is (m, 2); [:, 0] keeps the row indices of the non-zero entries
        level_boxes = boxes[ix.data, :]  # select the ROIs whose roi_level is 2, 3, 4, or 5 respectively
        # Keep track of which box is mapped to which level
        box_to_level.append(ix.data)  # append the per-level ROI indices so the original order can be restored later

        # Stop gradient propogation to ROI proposals
        level_boxes = level_boxes.detach()  # detach from the autograd graph; no gradients flow back into the proposals

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        per_level_boxes_num = level_boxes.size()[0]  # how many boxes this pyramid level received
        ind = Variable(torch.zeros(per_level_boxes_num), requires_grad=False).int()  # box-to-image indices; all zeros since batch size is 1
        if level_boxes.is_cuda:
            ind = ind.cuda()
        feature_maps[i] = feature_maps[i].unsqueeze(0)  # restore the batch dimension, (c, h, w) -> (1, c, h, w);
                                                        # CropAndResizeFunction needs batch dimension

        pooled_features = CropAndResizeFunction(pool_size, pool_size, 0)(feature_maps[i], level_boxes, ind)
                                                        # output shape: (per_level_boxes_num, c, pool_size, pool_size)
        pooled.append(pooled_features)

    # Pack pooled features into one tensor
    pooled = torch.cat(pooled, dim=0)  # concatenate along the box dimension: (num_rois, c, pool_size, pool_size)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = torch.cat(box_to_level, dim=0)

    # Rearrange pooled features to match the order of the original boxes
    _, box_to_level = torch.sort(box_to_level)  # pooling was done level by level, which scrambled the original
                                                # score-ordered arrangement of the ROIs; sorting box_to_level
                                                # recovers the original ordering

    pooled = pooled[box_to_level, :, :, :]  # first dim is the number of boxes: (num_boxes, c, 7, 7)

    return pooled
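# A worked example for the level-assignment formula above (FPN Eq. 1,
# hypothetical numbers): for a 1024x1024 image, a normalized box of size
# 0.0625 x 0.0625 covers 64x64 pixels, so
#   roi_level = 4 + log2(sqrt(h*w) / (224 / sqrt(image_area)))
#             = 4 + log2(64 / 224) ≈ 4 - 1.81 -> rounds to 2,
# i.e. small boxes are pooled from the high-resolution P2 feature map.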


############################################################
#  Detection Target Layer
############################################################
def bbox_overlaps(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].
    """
    # 1. Tile boxes2 and repeat boxes1. This allows us to compare
    # every box in boxes1 against every box in boxes2 without loops.
    # TF doesn't have an equivalent to np.repeat() so simulate it
    # using tf.tile() and tf.reshape.
    boxes1_repeat = boxes2.size()[0]   # number of boxes in boxes2 (each boxes1 row is repeated this many times)
    boxes2_repeat = boxes1.size()[0]   # number of boxes in boxes1 (boxes2 is tiled this many times)
    boxes1 = boxes1.repeat(1,boxes1_repeat).view(-1,4)  # repeat row by row: row 0 boxes1_repeat times,
                                                        # then row 1 boxes1_repeat times, and so on

    boxes2 = boxes2.repeat(boxes2_repeat,1)  # tile the whole tensor boxes2_repeat times

    # 2. Compute intersections
    b1_y1, b1_x1, b1_y2, b1_x2 = boxes1.chunk(4, dim=1)  # chunking keeps the column dimension, so each piece is (N, 1)
    b2_y1, b2_x1, b2_y2, b2_x2 = boxes2.chunk(4, dim=1)
    y1 = torch.max(b1_y1, b2_y1)[:, 0]  # element-wise max; [:, 0] flattens (N, 1) to a 1-D vector
    x1 = torch.max(b1_x1, b2_x1)[:, 0]
    y2 = torch.min(b1_y2, b2_y2)[:, 0]
    x2 = torch.min(b1_x2, b2_x2)[:, 0]
    zeros = Variable(torch.zeros(y1.size()[0]), requires_grad=False)
    if y1.is_cuda:
        zeros = zeros.cuda()
    intersection = torch.max(x2 - x1, zeros) * torch.max(y2 - y1, zeros)

    # 3. Compute unions
    b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
    union = b1_area[:,0] + b2_area[:,0] - intersection

    # 4. Compute IoU and reshape to [boxes1, boxes2]
    iou = intersection / union
    overlaps = iou.view(boxes2_repeat, boxes1_repeat)

    return overlaps
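# A quick, illustrative sanity check for bbox_overlaps (hypothetical boxes):
# b1 = torch.FloatTensor([[0, 0, 10, 10]])
# b2 = torch.FloatTensor([[0, 0, 10, 10], [0, 5, 10, 15]])
# bbox_overlaps(b1, b2)   # -> [[1.0000, 0.3333]]: identical box, then a box
#                         #    overlapping half of b1 (50 / 150 = 1/3)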
    
  
def ious(boxes1, boxes2):
    '''
    Broadcast IoU between two sets of boxes, each row (y1, x1, y2, x2).
    inputs:
        boxes1: [n, 4]
        boxes2: [m, 4]
    outputs:
        [n, m]
    '''
    b1_y1, b1_x1, b1_y2, b1_x2 = boxes1[:,0], boxes1[:,1], boxes1[:,2], boxes1[:,3]
    b2_y1, b2_x1, b2_y2, b2_x2 = boxes2[:,0], boxes2[:,1], boxes2[:,2], boxes2[:,3]

    # unsqueeze(1) turns the boxes1 columns into (n, 1) so they broadcast
    # against the (m,) columns of boxes2, producing (n, m) grids
    y1 = torch.max(b1_y1.unsqueeze(1), b2_y1)
    x1 = torch.max(b1_x1.unsqueeze(1), b2_x1)
    y2 = torch.min(b1_y2.unsqueeze(1), b2_y2)
    x2 = torch.min(b1_x2.unsqueeze(1), b2_x2)

    intersection = (y2 - y1).clamp(min=0) * (x2 - x1).clamp(min=0)  # (n, m)
    b1_area = ((b1_y2 - b1_y1) * (b1_x2 - b1_x1)).unsqueeze(1)      # (n, 1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)                     # (m,)
    union = b1_area + b2_area - intersection                        # (n, m)
    return intersection / union
IV. Supplementary discussion (quoted from Zhihu)

Mask R-CNN builds on Kaiming He's earlier work, FPN (Feature Pyramid Network); put vividly, it takes FPN's detection results and attaches a segmentation network behind them. The paper uses the currently popular top-down + bottom-up multi-level architecture: the original Faster R-CNN detected only on the last feature layer, which made it easy to lose small objects and insensitive to fine detail and occlusion. The recent trend is to combine features from multiple layers; answerer Kong Tao spotted this insight early on and built HyperNet, accepted as a CVPR oral.

Author: Oh233. Link: https://www.zhihu.com/question/57403701/answer/153060743. Source: Zhihu. Copyright belongs to the author; contact the author for commercial reuse and credit the source for non-commercial reuse.

Mask R-CNN's results are indeed strong, but note that much of the gain comes from many (all genuinely useful) engineering techniques: the anchor count grows from 12 to 15, the image size from 600 to 800, and the ROI batch size to 512. Judging from the FPN paper, these better practices bring a very visible performance boost (Table 3 baseline: AP = 26.3 -> 31.6). Our group's COCO 2016 segmentation-challenge winner, Fully Convolutional Instance-aware Semantic Segmentation (FCIS), finally had its code open-sourced last night. Limited by compute, we did not add these improvements; they should be fairly general and should also apply to FCIS, so feel free to try them. FCIS offers a simple and efficient framework for instance segmentation. Compared with MNC, the COCO 2015 winner, its main difference is that mask estimation and detection are done jointly rather than estimating the mask first and then detecting; in FCIS, detection and mask estimation influence each other through inside/outside score maps, exploiting what these two tightly coupled tasks have in common. The released version is based on MXNet with multi-GPU training support (msracver/FCIS). We actually had a usable Caffe version around this January, but the official Caffe at the time only supported single-GPU training for complex tasks, and on a dataset as large as COCO a single GPU takes weeks, which would have been painful for users, so in the end we decided not to release that version.
