Mask R-CNN Explained (Annotated Walkthrough)

I. Before studying Mask R-CNN, it is worth reviewing Faster R-CNN first (see the Faster R-CNN code walkthrough).

Key techniques in Mask R-CNN:

1. Multi-scale detection, built on the FPN (Feature Pyramid Network) technique (YOLOv3 later adopted a similar multi-scale scheme)

2. RPN (Region Proposal Network)

3. RoI Align

II. For a systematic walkthrough of Mask R-CNN, see the accompanying Bilibili video series.

III. Annotated source code

model.py

"""
Mask R-CNN
The main Mask R-CNN model implementation.
Copyright (c) 2017 Matterport, Inc.
Licensed under the MIT License (see LICENSE for details)
Written by Waleed Abdulla
"""

import datetime
import math
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

import utils
import visualize
#from nms.nms_wrapper import nms
from roialign.roi_align.crop_and_resize import CropAndResizeFunction

############################################################
# nms
############################################################
# Example input for the standalone nms() below (columns: x1, y1, x2, y2, score):
# boxes = np.array([[100, 100, 210, 210, 0.72],
#                   [250, 250, 420, 420, 0.80],
#                   [220, 220, 320, 330, 0.92],
#                   [100, 100, 210, 210, 0.72],
#                   [230, 240, 325, 330, 0.81],
#                   [220, 230, 315, 340, 0.90]])
def nms(dets, thresh):
    # dets: (m, 5) array of boxes (x1, y1, x2, y2, score); thresh: scalar IoU threshold
    x1 = dets[:,0]
    y1 = dets[:,1]
    x2 = dets[:,2]
    y2 = dets[:,3]
    areas = (y2-y1+1) * (x2-x1+1)
    scores = dets[:,4]
    keep = []
    index = scores.argsort()[::-1]
    while index.size > 0:
        i = index[0]       # the first remaining index always has the highest score, so keep it
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]])    # corners of the intersection rectangles
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22-x11+1)    # width of the intersection
        h = np.maximum(0, y22-y11+1)    # height of the intersection
        overlaps = w*h
        ious = overlaps / (areas[i]+areas[index[1:]] - overlaps)
        idx = np.where(ious<=thresh)[0]
        index = index[idx+1]   # offset by 1 because ious was computed against index[1:]
    return keep
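# A quick, illustrative sanity check for nms() using the example boxes above
# (uncomment to run; the surviving indices depend on the chosen thresh):
# boxes = np.array([[100, 100, 210, 210, 0.72],
#                   [250, 250, 420, 420, 0.80],
#                   [220, 220, 320, 330, 0.92],
#                   [100, 100, 210, 210, 0.72],
#                   [230, 240, 325, 330, 0.81],
#                   [220, 230, 315, 340, 0.90]])
# keep = nms(boxes, thresh=0.7)
# print(keep)  # indices of the boxes that survive suppression, highest scores first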
# import matplotlib.pyplot as plt
# def plot_bbox(dets, c='k'):
    
    # x1 = dets[:,0]
    # y1 = dets[:,1]
    # x2 = dets[:,2]
    # y2 = dets[:,3]
    
    # plt.plot([x1,x2], [y1,y1], c)
    # plt.plot([x1,x1], [y1,y2], c)
    # plt.plot([x1,x2], [y2,y2], c)
    # plt.plot([x2,x2], [y1,y2], c)
    # plt.title("after nms")  

############################################################
#  Logging Utility Functions
############################################################

def log(text, array=None):
    """Prints a text message. And, optionally, if a Numpy array is provided it
    prints its shape, min, and max values.
    """
    if array is not None:
        text = text.ljust(25)  # left-justify, padding with spaces to width 25 (returned unchanged if already longer)
        text += ("shape: {:20}  min: {:10.5f}  max: {:10.5f}".format(
            str(array.shape),                    # turn the shape tuple into a string, e.g. (m, n) -> '(m, n)'
            array.min() if array.size else "",   # array.size is the element count m*n; min()/max() over the whole array
            array.max() if array.size else ""))
    print(text)

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')  # '\r' redraws the bar on the same line
    # Print New Line on Complete
    if iteration == total:
        print()
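# A minimal usage sketch for printProgressBar (illustrative values):
# total = 20
# for step in range(1, total + 1):
#     # ... one unit of work per iteration ...
#     printProgressBar(step, total, prefix='Progress:', suffix='Complete', length=40)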


############################################################
#  Pytorch Utility Functions
############################################################

def unique1d(tensor):
    if tensor.size()[0] == 0 or tensor.size()[0] == 1:
        return tensor
    tensor = tensor.sort()[0]  # sort the 1-D tensor so duplicates become adjacent
    unique_bool = tensor[1:] != tensor[:-1]  # True where an element differs from its predecessor
    first_element = Variable(torch.ByteTensor([True]), requires_grad=False).bool()
    if tensor.is_cuda:
        first_element = first_element.cuda()
    unique_bool = torch.cat((first_element, unique_bool), dim=0)  # the first element is always kept
    return tensor[unique_bool.data]

# Intersection of two 1-D tensors
def intersect1d(tensor1, tensor2):
    assert len(tensor1.shape) == 1 and len(tensor2.shape) == 1 and len(tensor1) > 1 and len(tensor2) > 1, \
        "inputs must be 1-D with more than one element"
    aux = torch.cat((tensor1, tensor2), dim=0)
    aux = aux.sort()[0]  # after sorting, values shared by both tensors sit next to each other
    return aux[:-1][(aux[1:] == aux[:-1]).data]
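# Quick, illustrative checks for unique1d / intersect1d (hypothetical inputs;
# intersect1d assumes each input contains no internal duplicates):
# a = torch.IntTensor([3, 1, 2, 3, 1])
# unique1d(a)              # -> tensor of [1, 2, 3], sorted, duplicates removed
# t1 = torch.IntTensor([1, 2, 3])
# t2 = torch.IntTensor([2, 3, 5])
# intersect1d(t1, t2)      # -> tensor of [2, 3]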

def log2(x):
    """Implementation of log2. PyTorch doesn't have a native implementation."""
    ln2 = Variable(torch.log(torch.FloatTensor([2.0])), requires_grad=False)
    if x.is_cuda:
        ln2 = ln2.cuda()
    return torch.log(x) / ln2

class SamePad2d(nn.Module):
    """Mimics tensorflow's 'SAME' padding.
    """

    def __init__(self, kernel_size, stride):
        super(SamePad2d, self).__init__()
        self.kernel_size = torch.nn.modules.utils._pair(kernel_size)  # _pair() turns a scalar into a (k, k) tuple
        self.stride = torch.nn.modules.utils._pair(stride)

    def forward(self, input):
        # input: (batch, c, h, w)
        in_width = input.size()[3]
        in_height = input.size()[2]
        out_width = math.ceil(float(in_width) / float(self.stride[0]))   # round up
        out_height = math.ceil(float(in_height) / float(self.stride[1]))
        pad_along_width = ((out_width - 1) * self.stride[0] +
                           self.kernel_size[0] - in_width)
        pad_along_height = ((out_height - 1) * self.stride[1] +
                            self.kernel_size[1] - in_height)  # invert out = (in + pad - k + s) / s to solve for pad
        pad_left = math.floor(pad_along_width / 2)   # round down; any odd remainder goes to the right/bottom
        pad_top = math.floor(pad_along_height / 2)
        pad_right = pad_along_width - pad_left
        pad_bottom = pad_along_height - pad_top
        return F.pad(input, (pad_left, pad_right, pad_top, pad_bottom), 'constant', 0)

    def __repr__(self):
        return self.__class__.__name__
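# A small shape check for SamePad2d (illustrative): with 'SAME' padding,
# a stride-1 3x3 convolution preserves the spatial size.
# pad = SamePad2d(kernel_size=3, stride=1)
# x = Variable(torch.randn(1, 8, 32, 32))
# pad(x).size()   # -> (1, 8, 34, 34); a following 3x3 stride-1 conv returns 32x32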


############################################################
#  FPN Graph
############################################################


class TopDownLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(TopDownLayer, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)  # 1x1 conv to compress channels
        self.padding2 = SamePad2d(kernel_size=3, stride=1)   # zero-pads the feature map ('SAME' padding)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1)

    def forward(self, x, y):
        y = F.upsample(y, scale_factor=2)  # upsample y to twice its height and width
        x = self.conv1(x)                  # adjust the channel count of x
        return self.conv2(self.padding2(x + y))  # fuse x and y, pad, then convolve so the spatial size is unchanged
          
class FPN(nn.Module):
    def __init__(self, C1, C2, C3, C4, C5, out_channels):
        super(FPN, self).__init__()
        self.out_channels = out_channels  # every pyramid level outputs the same number of channels
        self.C1 = C1    # backbone stages (convolution blocks)
        self.C2 = C2
        self.C3 = C3
        self.C4 = C4
        self.C5 = C5
        self.P6 = nn.MaxPool2d(kernel_size=1, stride=2)

        self.P5_conv1 = nn.Conv2d(2048, self.out_channels, kernel_size=1, stride=1)
        self.P5_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),  # pad first so the 3x3 conv below preserves the spatial size
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),  # 3x3 conv to reduce upsampling aliasing
        )
        self.P4_conv1 =  nn.Conv2d(1024, self.out_channels, kernel_size=1, stride=1)
        self.P4_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P3_conv1 = nn.Conv2d(512, self.out_channels, kernel_size=1, stride=1)
        self.P3_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )
        self.P2_conv1 = nn.Conv2d(256, self.out_channels, kernel_size=1, stride=1)
        self.P2_conv2 = nn.Sequential(
            SamePad2d(kernel_size=3, stride=1),
            nn.Conv2d(self.out_channels, self.out_channels, kernel_size=3, stride=1),
        )

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        c2_out = x
        x = self.C3(x)
        c3_out = x
        x = self.C4(x)
        c4_out = x
        x = self.C5(x)
        p5_out = self.P5_conv1(x)
        p4_out = self.P4_conv1(c4_out) + F.upsample(p5_out, scale_factor=2)
        p3_out = self.P3_conv1(c3_out) + F.upsample(p4_out, scale_factor=2)
        p2_out = self.P2_conv1(c2_out) + F.upsample(p3_out, scale_factor=2)

        p5_out = self.P5_conv2(p5_out)
        p4_out = self.P4_conv2(p4_out)
        p3_out = self.P3_conv2(p3_out)
        p2_out = self.P2_conv2(p2_out)

        # P6 is used for the 5th anchor scale in RPN. Generated by
        # subsampling from P5 with stride of 2.
        p6_out = self.P6(p5_out)

        return [p2_out, p3_out, p4_out, p5_out, p6_out]
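# Illustrative output shapes for a hypothetical 1024x1024 input image with
# out_channels=256 (strides 4, 8, 16, 32, 64 relative to the input):
# P2: (1, 256, 256, 256)
# P3: (1, 256, 128, 128)
# P4: (1, 256,  64,  64)
# P5: (1, 256,  32,  32)
# P6: (1, 256,  16,  16)   # used only by the RPN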


############################################################
#  Resnet Graph
############################################################
class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride)  
        self.bn1 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01) 
        self.padding2 = SamePad2d(kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(planes, eps=0.001, momentum=0.01)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1)   
        self.bn3 = nn.BatchNorm2d(planes * 4, eps=0.001, momentum=0.01) 
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        # assume x is (n, inplanes, h, w)
        out = self.conv1(x)  # compress channels   (n, planes, h, w)
        out = self.bn1(out)  # normalize           (n, planes, h, w)
        out = self.relu(out) # non-linearity       (n, planes, h, w)

        out = self.padding2(out)  # add padding    (n, planes, h+pad, w+pad)
        out = self.conv2(out)     # 3x3 conv back to (n, planes, h, w)
        out = self.bn2(out)       # (n, planes, h, w)
        out = self.relu(out)      # (n, planes, h, w)

        out = self.conv3(out)     # (n, 4*planes, h, w)
        out = self.bn3(out)       # (n, 4*planes, h, w)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
        
class ResNet(nn.Module):

    def __init__(self, architecture, stage5=False):
        super(ResNet, self).__init__()
        assert architecture in ["resnet50", "resnet101"]
        self.inplanes = 64
        self.layers = [3, 4, {"resnet50": 6, "resnet101": 23}[architecture], 3]
        self.block = Bottleneck
        self.stage5 = stage5

        self.C1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64, eps=0.001, momentum=0.01),
            nn.ReLU(inplace=True),
            SamePad2d(kernel_size=3, stride=2),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.C2 = self.make_layer(self.block, 64, self.layers[0])
        self.C3 = self.make_layer(self.block, 128, self.layers[1], stride=2)
        self.C4 = self.make_layer(self.block, 256, self.layers[2], stride=2)
        if self.stage5:
            self.C5 = self.make_layer(self.block, 512, self.layers[3], stride=2)
        else:
            self.C5 = None

    def forward(self, x):
        x = self.C1(x)
        x = self.C2(x)
        x = self.C3(x)
        x = self.C4(x)
        x = self.C5(x)  # assumes stage5=True; with stage5=False, C5 is None and this call would fail
        return x


    def stages(self):
        return [self.C1, self.C2, self.C3, self.C4, self.C5]

    def make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes * block.expansion, eps=0.001, momentum=0.01),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
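# How the ResNet stages are wired into the FPN above (a sketch; it assumes
# stage5=True so that C5 exists, matching FPN's 2048-channel P5_conv1):
# resnet = ResNet("resnet101", stage5=True)
# C1, C2, C3, C4, C5 = resnet.stages()
# fpn = FPN(C1, C2, C3, C4, C5, out_channels=256)
# p2_out, p3_out, p4_out, p5_out, p6_out = fpn(images)   # images: (1, 3, h, w)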


############################################################
#  Proposal Layer
############################################################

def apply_box_deltas(boxes, deltas):
    """Applies the given deltas to the given boxes.
    boxes: [N, 4] where each row is y1, x1, y2, x2
    deltas: [N, 4] where each row is [dy, dx, log(dh), log(dw)]
    """
    # Convert to y, x, h, w
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    center_y = boxes[:, 0] + 0.5 * height
    center_x = boxes[:, 1] + 0.5 * width
    # Apply deltas
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= torch.exp(deltas[:, 2])
    width *= torch.exp(deltas[:, 3])
    # Convert back to y1, x1, y2, x2
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    y2 = y1 + height
    x2 = x1 + width
    result = torch.stack([y1, x1, y2, x2], dim=1)
    return result
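# A worked example for apply_box_deltas (hypothetical numbers): the box
# (y1, x1, y2, x2) = (0, 0, 10, 10) has center (5, 5) and h = w = 10; the
# deltas (0.1, 0.0, log(2), 0.0) shift the center to (6, 5) and double the
# height, giving (-4, 0, 16, 10):
# boxes  = torch.FloatTensor([[0, 0, 10, 10]])
# deltas = torch.FloatTensor([[0.1, 0.0, math.log(2), 0.0]])
# apply_box_deltas(boxes, deltas)   # -> [[-4., 0., 16., 10.]]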

def clip_boxes(boxes, window):
    """
    boxes: [N, 4] each col is y1, x1, y2, x2
    window: [4] in the form y1, x1, y2, x2
    """
    boxes = torch.stack( \
        [boxes[:, 0].clamp(float(window[0]), float(window[2])),
         boxes[:, 1].clamp(float(window[1]), float(window[3])),
         boxes[:, 2].clamp(float(window[0]), float(window[2])),
         boxes[:, 3].clamp(float(window[1]), float(window[3]))], 1)
    return boxes

def proposal_layer(inputs, proposal_count, nms_threshold, anchors, config=None):
    """Receives anchor scores and selects a subset to pass as proposals
    to the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding
    box refinement deltas to anchors.
    anchors: (anchors_num, 4), given in image-pixel coordinates
    Inputs:
        rpn_probs: [batch, anchors_num, (bg prob, fg prob)]
        rpn_bbox: [batch, anchors_num, (dy, dx, log(dh), log(dw))]
    Returns:
        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
    """

    # Currently only supports batchsize 1
    # inputs is a list: [rpn_probs, rpn_bbox]
    # inputs[0] is rpn_probs (batch, anchors_num, 2)
    # inputs[1] is rpn_bbox  (batch, anchors_num, 4)

    inputs[0] = inputs[0].squeeze(0)  # drop the batch dimension; each batch holds only one image
    inputs[1] = inputs[1].squeeze(0)

    # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
    scores = inputs[0][:, 1]

    # Box deltas [batch, num_rois, 4]
    deltas = inputs[1]  # the per-anchor refinements predicted by the first stage (RPN)
    # RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
    std_dev = Variable(torch.from_numpy(np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(), requires_grad=False)
    if config.GPU_COUNT:   # GPU_COUNT=1 means run on the GPU; 0 means run on the CPU
        std_dev = std_dev.cuda()
    deltas = deltas * std_dev

    # Improve performance by trimming to top anchors by score
    # and doing the rest on the smaller subset.
    pre_nms_limit = min(6000, anchors.size()[0])   # how many anchors to keep before NMS
    scores, order = scores.sort(descending=True)   # sort scores in descending order; returns the sorted
                                                   # values and their indices in the original sequence
    order = order[:pre_nms_limit]                  # keep the top pre_nms_limit indices
    scores = scores[:pre_nms_limit]                # keep the top pre_nms_limit scores, shape (pre_nms_limit,)
    deltas = deltas[order.data, :]  # TODO: Support batch size > 1 ff.  # deltas of the top pre_nms_limit anchors
    anchors = anchors[order.data, :]               # same for the anchors

    # Apply deltas to anchors to get refined anchors.
    # [batch, N, (y1, x1, y2, x2)]
    boxes = apply_box_deltas(anchors, deltas)      # refine the anchors with the predicted offsets; the result
                                                   # holds corner coordinates, shape (pre_nms_limit, (y1, x1, y2, x2))

    # Clip to image boundaries. [batch, N, (y1, x1, y2, x2)]
    height, width = config.IMAGE_SHAPE[:2]         # image boundary
    window = np.array([0, 0, height, width]).astype(np.float32)
    boxes = clip_boxes(boxes, window)              # constrain the boxes to lie inside the image

    # Filter out small boxes
    # According to Xinlei Chen's paper, this reduces detection accuracy
    # for small objects, so we're skipping it.

    # Non-max suppression
    # scores.unsqueeze(1) turns scores into (pre_nms_limit, 1), which is concatenated
    # with boxes (pre_nms_limit, 4) along dim=1 to form a (pre_nms_limit, 5) tensor
    keep = nms(torch.cat((boxes, scores.unsqueeze(1)), 1).data, nms_threshold)  # keep is a list of the box indices that survive NMS
    if len(keep) > proposal_count:
        keep = keep[:proposal_count]
    boxes = boxes[keep, :]

    # Normalize dimensions to range of 0 to 1.
    norm = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        norm = norm.cuda()
    normalized_boxes = boxes / norm  # normalize the boxes to [0, 1]

    # Add back batch dimension
    normalized_boxes = normalized_boxes.unsqueeze(0)  # (proposal_count, 4) -> (1, proposal_count, 4)

    return normalized_boxes
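# A call sketch for proposal_layer (illustrative; the config attribute names
# follow the Matterport config, and shapes assume batch size 1):
# rpn_rois = proposal_layer([rpn_probs, rpn_bbox],            # (1, anchors_num, 2), (1, anchors_num, 4)
#                           proposal_count=config.POST_NMS_ROIS_TRAINING,
#                           nms_threshold=config.RPN_NMS_THRESHOLD,
#                           anchors=anchors, config=config)
# rpn_rois: (1, proposal_count, 4), normalized (y1, x1, y2, x2)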


############################################################
#  ROIAlign Layer
############################################################

def pyramid_roi_align(inputs, pool_size, image_shape):
    """Implements ROI Pooling on multiple levels of the feature pyramid.
    Params:
    - pool_size: [height, width] of the output pooled regions. Usually [7, 7]
    - image_shape: [channels, height, width]. Shape of the input image in pixels
    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
             coordinates.
    - Feature maps: List of feature maps from different levels of the pyramid.
                    Each is [batch, channels, height, width]
    Output:
    Pooled regions in the shape: [num_boxes, channels, height, width].
    The width and height are those specified in pool_size.
    """

    # Currently only supports batchsize 1
    for i in range(len(inputs)):
        inputs[i] = inputs[i].squeeze(0)  # drop the batch dimension

    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]  # (num_boxes, 4)

    # Feature Maps. List of feature maps from different level of the
    # feature pyramid. Each is [batch, channels, height, width]
    feature_maps = inputs[1:]  # [p2, p3, p4, p5, p6]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = boxes.chunk(4, dim=1)  # split boxes into four (num_boxes, 1) columns
    h = y2 - y1
    w = x2 - x1

    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # a 224x224 ROI (in pixels) maps to P4
    image_area = Variable(torch.FloatTensor([float(image_shape[1]*image_shape[2])]), requires_grad=False)
    if boxes.is_cuda:
        image_area = image_area.cuda()
    roi_level = 4 + log2(torch.sqrt(h*w)/(224.0/torch.sqrt(image_area)))  # choose which pyramid level each ROI is pooled from
    roi_level = roi_level.round().int()   # round() rounds to the nearest integer
    roi_level = roi_level.clamp(2, 5)     # FPN produces [P2, P3, P4, P5, P6], but only [P2, P3, P4, P5] are used for ROI pooling


    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):  # i in [0, 1, 2, 3], level in [2, 3, 4, 5]
        ix = roi_level == level  # boolean mask of the ROIs assigned to this level
        if not ix.any():  # any() returns True if at least one element is True; skip levels with no ROIs
            continue
        assert len(ix.shape) > 1, "ix must have more than one dimension"
        ix = torch.nonzero(ix)[:,0]  # torch.nonzero returns the indices of the non-zero elements as an (m, d) matrix,
                                     # where m is the number of non-zero elements and d the number of dimensions.
                                     # e.g. ix=tensor([0,1,0,1])             -> tensor([[1],[3]])                 shape (2, 1)
                                     #      ix=tensor([[0,1,0,1],[0,1,0,1]]) -> tensor([[0,1],[0,3],[1,1],[1,3]]) shape (4, 2)
        # since ix has shape (n, 1) here, torch.nonzero(ix) is (m, 2); [:, 0] keeps the row indices of the non-zero entries
        level_boxes = boxes[ix.data, :]  # select the ROIs whose roi_level is 2, 3, 4, or 5 respectively
        # Keep track of which box is mapped to which level
        box_to_level.append(ix.data)  # append the per-level ROI indices so the original order can be restored later

        # Stop gradient propogation to ROI proposals
        level_boxes = level_boxes.detach()  # detach from the autograd graph; no gradients flow back into the proposals

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        per_level_boxes_num = level_boxes.size()[0]  # how many boxes this pyramid level received
        ind = Variable(torch.zeros(per_level_boxes_num), requires_grad=False).int()  # box-to-image indices; all zeros since batch size is 1
        if level_boxes.is_cuda:
            ind = ind.cuda()
        feature_maps[i] = feature_maps[i].unsqueeze(0)  # restore the batch dimension, (c, h, w) -> (1, c, h, w);
                                                        # CropAndResizeFunction needs batch dimension

        pooled_features = CropAndResizeFunction(pool_size, pool_size, 0)(feature_maps[i], level_boxes, ind)
                                                        # output shape: (per_level_boxes_num, c, pool_size, pool_size)
        pooled.append(pooled_features)

    # Pack pooled features into one tensor
    pooled = torch.cat(pooled, dim=0)  # concatenate along the box dimension: (num_rois, c, pool_size, pool_size)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = torch.cat(box_to_level, dim=0)

    # Rearrange pooled features to match the order of the original boxes
    _, box_to_level = torch.sort(box_to_level)  # pooling was done level by level, which scrambled the original
                                                # score-ordered arrangement of the ROIs; sorting box_to_level
                                                # recovers the original ordering

    pooled = pooled[box_to_level, :, :, :]  # first dim is the number of boxes: (num_boxes, c, 7, 7)

    return pooled
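# A worked example for the level-assignment formula above (FPN Eq. 1,
# hypothetical numbers): for a 1024x1024 image, a normalized box of size
# 0.0625 x 0.0625 covers 64x64 pixels, so
#   roi_level = 4 + log2(sqrt(h*w) / (224 / sqrt(image_area)))
#             = 4 + log2(64 / 224) ≈ 4 - 1.81 -> rounds to 2,
# i.e. small boxes are pooled from the high-resolution P2 feature map.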


############################################################
#  Detection Target Layer
############################################################
def bbox_overlaps(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].
    """
    # 1. Tile boxes2 and repeat boxes1. This allows us to compare
    # every box in boxes1 against every box in boxes2 without loops.
    # TF doesn't have an equivalent to np.repeat() so simulate it
    # using tf.tile() and tf.reshape.
    boxes1_repeat = boxes2.size()[0]   # number of boxes in boxes2 (each boxes1 row is repeated this many times)
    boxes2_repeat = boxes1.size()[0]   # number of boxes in boxes1 (boxes2 is tiled this many times)
    boxes1 = boxes1.repeat(1,boxes1_repeat).view(-1,4)  # repeat row by row: row 0 boxes1_repeat times,
                                                        # then row 1 boxes1_repeat times, and so on

    boxes2 = boxes2.repeat(boxes2_repeat,1)  # tile the whole tensor boxes2_repeat times

    # 2. Compute intersections
    b1_y1, b1_x1, b1_y2, b1_x2 = boxes1.chunk(4, dim=1)  # chunking keeps the column dimension, so each piece is (N, 1)
    b2_y1, b2_x1, b2_y2, b2_x2 = boxes2.chunk(4, dim=1)
    y1 = torch.max(b1_y1, b2_y1)[:, 0]  # element-wise max; [:, 0] flattens (N, 1) to a 1-D vector
    x1 = torch.max(b1_x1, b2_x1)[:, 0]
    y2 = torch.min(b1_y2, b2_y2)[:, 0]
    x2 = torch.min(b1_x2, b2_x2)[:, 0]
    zeros = Variable(torch.zeros(y1.size()[0]), requires_grad=False)
    if y1.is_cuda:
        zeros = zeros.cuda()
    intersection = torch.max(x2 - x1, zeros) * torch.max(y2 - y1, zeros)

    # 3. Compute unions
    b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
    union = b1_area[:,0] + b2_area[:,0] - intersection

    # 4. Compute IoU and reshape to [boxes1, boxes2]
    iou = intersection / union
    overlaps = iou.view(boxes2_repeat, boxes1_repeat)

    return overlaps
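# A quick, illustrative sanity check for bbox_overlaps (hypothetical boxes):
# b1 = torch.FloatTensor([[0, 0, 10, 10]])
# b2 = torch.FloatTensor([[0, 0, 10, 10], [0, 5, 10, 15]])
# bbox_overlaps(b1, b2)   # -> [[1.0000, 0.3333]]: identical box, then a box
#                         #    overlapping half of b1 (50 / 150 = 1/3)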
    
  
def ious(boxes1, boxes2):
    '''
    Broadcast IoU between two sets of boxes, each row (y1, x1, y2, x2).
    inputs:
        boxes1: [n, 4]
        boxes2: [m, 4]
    outputs:
        [n, m]
    '''
    b1_y1, b1_x1, b1_y2, b1_x2 = boxes1[:,0], boxes1[:,1], boxes1[:,2], boxes1[:,3]
    b2_y1, b2_x1, b2_y2, b2_x2 = boxes2[:,0], boxes2[:,1], boxes2[:,2], boxes2[:,3]

    # unsqueeze(1) turns the boxes1 columns into (n, 1) so they broadcast
    # against the (m,) columns of boxes2, producing (n, m) grids
    y1 = torch.max(b1_y1.unsqueeze(1), b2_y1)
    x1 = torch.max(b1_x1.unsqueeze(1), b2_x1)
    y2 = torch.min(b1_y2.unsqueeze(1), b2_y2)
    x2 = torch.min(b1_x2.unsqueeze(1), b2_x2)

    intersection = (y2 - y1).clamp(min=0) * (x2 - x1).clamp(min=0)  # (n, m)
    b1_area = ((b1_y2 - b1_y1) * (b1_x2 - b1_x1)).unsqueeze(1)      # (n, 1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)                     # (m,)
    union = b1_area + b2_area - intersection                        # (n, m)
    return intersection / union
IV. Supplementary discussion (quoted from Zhihu)

Mask R-CNN builds on Kaiming He's earlier work, FPN (Feature Pyramid Network); put vividly, it takes FPN's detection results and attaches a segmentation network behind them. The paper uses the currently popular top-down + bottom-up multi-level architecture: the original Faster R-CNN detected only on the last feature layer, which made it easy to lose small objects and insensitive to fine detail and occlusion. The recent trend is to combine features from multiple layers; answerer Kong Tao spotted this insight early on and built HyperNet, accepted as a CVPR oral.

Author: Oh233. Link: https://www.zhihu.com/question/57403701/answer/153060743. Source: Zhihu. Copyright belongs to the author; contact the author for commercial reuse and credit the source for non-commercial reuse.

Mask R-CNN's results are indeed strong, but note that much of the gain comes from many (all genuinely useful) engineering techniques: the anchor count grows from 12 to 15, the image size from 600 to 800, and the ROI batch size to 512. Judging from the FPN paper, these better practices bring a very visible performance boost (Table 3 baseline: AP = 26.3 -> 31.6). Our group's COCO 2016 segmentation-challenge winner, Fully Convolutional Instance-aware Semantic Segmentation (FCIS), finally had its code open-sourced last night. Limited by compute, we did not add these improvements; they should be fairly general and should also apply to FCIS, so feel free to try them. FCIS offers a simple and efficient framework for instance segmentation. Compared with MNC, the COCO 2015 winner, its main difference is that mask estimation and detection are done jointly rather than estimating the mask first and then detecting; in FCIS, detection and mask estimation influence each other through inside/outside score maps, exploiting what these two tightly coupled tasks have in common. The released version is based on MXNet with multi-GPU training support (msracver/FCIS). We actually had a usable Caffe version around this January, but the official Caffe at the time only supported single-GPU training for complex tasks, and on a dataset as large as COCO a single GPU takes weeks, which would have been painful for users, so in the end we decided not to release that version.
