Project outline:
Building your own dataset
Running & dissecting the small training script
Dissecting Mask-RCNN:
Understanding mask-rcnn
In the previous section we covered how to use the code for training and validation; now we dig one level deeper into how the code is structured.
Open mask-rcnn\libraries\mrcnn\model.py:
This file is enormous, close to 3,000 lines, and understanding all of it is a serious effort.
We start from the main class, MaskRCNN.
The parameters are all collected in Config. Below is a walkthrough of every Config parameter; without knowing the code structure you can't yet tell how each one is used, which is what the rest of this section works through.
class Config(object):
# Name the configuration. For example, 'COCO', 'Experiment3', etc.
# Useful if your code needs to do something differently depending on which experiment it is running.
NAME = None # Override in sub-classes
# Number of GPUs to use. When using CPU only, set this to 1.
GPU_COUNT = 1
# Number of images to train on per GPU. A 12GB GPU can typically handle two 1024x1024px images.
# Adjust based on your GPU memory and image sizes. Use the highest number your GPU can handle for best performance.
IMAGES_PER_GPU = 2
# Number of training steps per epoch. This doesn't need to match the size of the training set.
# TensorBoard updates are saved at the end of each epoch, so a smaller number means more frequent updates.
# Validation stats are also computed at the end of each epoch, and they can take a while, so don't set this too small or you'll spend a lot of time on validation.
STEPS_PER_EPOCH = 1000
# Number of validation steps to run at the end of every training epoch.
# A larger number improves the accuracy of validation stats but slows training down.
VALIDATION_STEPS = 50
# Backbone network architecture. Supported values are: resnet50, resnet101.
# You can also provide a callable with the signature of model.resnet_graph.
# If you do, you also need to provide a callable for COMPUTE_BACKBONE_SHAPE.
BACKBONE = "resnet101"
# Only useful if you supply a callable backbone. Should compute the shape of each layer of the FPN pyramid. See model.compute_backbone_shapes.
COMPUTE_BACKBONE_SHAPE = None
# The strides of each layer of the FPN pyramid. These values are based on a ResNet101 backbone.
# Used by the RPN when generating anchors.
BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# Size of the fully-connected layers in the classification head.
FPN_CLASSIF_FC_LAYERS_SIZE = 1024
# Size of the top-down layers used to build the feature pyramid. Used by the main network.
TOP_DOWN_PYRAMID_SIZE = 256
# Number of classification classes (including background).
NUM_CLASSES = 1
# Length of square anchor side in pixels. Used by the RPN when generating anchors.
RPN_ANCHOR_SCALES = (32, 64, 128, 256, 512)
# Ratios of anchors (width/height) at each cell.
# 1 gives square anchors; 0.5 and 2 give 1:2 and 2:1 anchors.
RPN_ANCHOR_RATIOS = [0.5, 1, 2]
# Anchor stride
# If 1 then anchors are created for each cell in the backbone feature map.
# If 2, then anchors are created for every other cell, and so on.
RPN_ANCHOR_STRIDE = 1 # used by the RPN when generating anchors
# Non-max suppression threshold to filter RPN proposals.
# You can increase this during training to generate more proposals.
RPN_NMS_THRESHOLD = 0.7
# How many anchors per image to use for RPN training
# (these anchor training targets are prepared in the data generator)
RPN_TRAIN_ANCHORS_PER_IMAGE = 256
# ROIs kept after tf.nn.top_k and before non-maximum suppression
PRE_NMS_LIMIT = 6000
# ROIs kept after non-maximum suppression (training and inference)
POST_NMS_ROIS_TRAINING = 2000
POST_NMS_ROIS_INFERENCE = 1000
# If enabled, resizes instance masks to a smaller size to reduce
# memory load. Recommended when using high-resolution images.
USE_MINI_MASK = True
MINI_MASK_SHAPE = (56, 56) # (height, width) of the mini-mask; used when loading images
# Input image resizing
# Generally, use the "square" resizing mode for training and predicting
# and it should work well in most cases. In this mode, images are scaled
# up such that the small side is = IMAGE_MIN_DIM, but ensuring that the
# scaling doesn't make the long side > IMAGE_MAX_DIM. Then the image is
# padded with zeros to make it a square so multiple images can be put
# in one batch.
# Available resizing modes:
# none: No resizing or padding. Return the image unchanged.
# square: Resize and pad with zeros to get a square image
# of size [max_dim, max_dim].
# pad64: Pads width and height with zeros to make them multiples of 64.
# If IMAGE_MIN_DIM or IMAGE_MIN_SCALE are not None, then it scales
# up before padding. IMAGE_MAX_DIM is ignored in this mode.
# The multiple of 64 is needed to ensure smooth scaling of feature
# maps up and down the 6 levels of the FPN pyramid (2**6=64).
# crop: Picks random crops from the image. First, scales the image based
# on IMAGE_MIN_DIM and IMAGE_MIN_SCALE, then picks a random crop of
# size IMAGE_MIN_DIM x IMAGE_MIN_DIM. Can be used in training only.
# IMAGE_MAX_DIM is not used in this mode.
IMAGE_RESIZE_MODE = "square"
IMAGE_MIN_DIM = 800
IMAGE_MAX_DIM = 1024
# Minimum scaling ratio. Checked after MIN_IMAGE_DIM and can force further
# up scaling. For example, if set to 2 then images are scaled up to double
# the width and height, or more, even if MIN_IMAGE_DIM doesn't require it.
# However, in 'square' mode, it can be overruled by IMAGE_MAX_DIM.
IMAGE_MIN_SCALE = 0
# Number of color channels per image. RGB = 3, grayscale = 1, RGB-D = 4
# Changing this requires other changes in the code. See the WIKI for more
# details: https://github.com/matterport/Mask_RCNN/wiki
IMAGE_CHANNEL_COUNT = 3
# Image mean (RGB), used for image normalization
MEAN_PIXEL = np.array([123.7, 116.8, 103.9])
# Number of ROIs per image to feed to the classifier/mask heads.
# The Mask R-CNN paper uses 512, but often the RPN doesn't generate
# enough positive proposals to fill this and keep a positive:negative
# ratio of 1:3. You can increase the number of proposals by adjusting
# the RPN NMS threshold.
TRAIN_ROIS_PER_IMAGE = 200
# Percent of positive ROIs used to train the classifier/mask heads
ROI_POSITIVE_RATIO = 0.33
# Pooled ROI sizes; used by ROIAlign in the heads
POOL_SIZE = 7
MASK_POOL_SIZE = 14
# Shape of the output mask.
# To change this you also need to change the neural network mask branch.
MASK_SHAPE = [28, 28]
# Maximum number of ground-truth instances to use in one image
MAX_GT_INSTANCES = 100
# Bounding box refinement standard deviations for the RPN and for the final detections.
# The first scales the RPN anchor deltas (261,888 anchors for a 1024x1024 input); the second scales the deltas in the detection heads.
RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Max number of final detections. Used by the inference detection head; during training, TRAIN_ROIS_PER_IMAGE (200) applies instead.
DETECTION_MAX_INSTANCES = 100
# Minimum probability value to accept a detected instance;
# ROIs below this threshold are skipped.
# Filters out low-confidence boxes; used by the inference detection head.
DETECTION_MIN_CONFIDENCE = 0.7
# Non-maximum suppression threshold for detection; used by the inference detection head
DETECTION_NMS_THRESHOLD = 0.3
# Learning rate and momentum. The Mask R-CNN paper uses lr=0.02, but on TensorFlow it causes the weights to explode, likely due to differences in optimizer implementation.
LEARNING_RATE = 0.001
LEARNING_MOMENTUM = 0.9 # momentum
# Weight decay regularization; used for L2 regularization
WEIGHT_DECAY = 0.0001
# Loss weights for more precise optimization.
# Can be used for R-CNN training setup.
LOSS_WEIGHTS = {
"rpn_class_loss": 1.,
"rpn_bbox_loss": 1.,
"mrcnn_class_loss": 1.,
"mrcnn_bbox_loss": 1.,
"mrcnn_mask_loss": 1.
}
# Use RPN ROIs or externally generated ROIs for training.
# Keep this True in most cases. Set to False if you want to train the head
# branches on ROIs generated by code rather than the ROIs from the RPN,
# e.g. to debug the classifier head without having to train the RPN.
USE_RPN_ROIS = True
# Train or freeze batch normalization layers
# None: Train BN layers. This is the normal mode
# False: Freeze BN layers. Good when using a small batch size
# True: (don't use). Set layer in training mode even when predicting
TRAIN_BN = False # Defaulting to False since batch size is often small
# Gradient norm clipping; used by the SGD optimizer
GRADIENT_CLIP_NORM = 5.0
def __init__(self):
"""Set values of computed attributes."""
# Effective batch size
self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT # 2 * 1 = 2 with the defaults above
# Input image size
if self.IMAGE_RESIZE_MODE == "crop":
self.IMAGE_SHAPE = np.array([self.IMAGE_MIN_DIM, self.IMAGE_MIN_DIM,
self.IMAGE_CHANNEL_COUNT])
else:
self.IMAGE_SHAPE = np.array([self.IMAGE_MAX_DIM, self.IMAGE_MAX_DIM,
self.IMAGE_CHANNEL_COUNT])
# Image meta data length
# See compose_image_meta() for details
self.IMAGE_META_SIZE = 1 + 3 + 3 + 4 + 1 + self.NUM_CLASSES
def display(self):
"""Display Configuration values."""
print("\nConfigurations:")
for a in dir(self):
if not a.startswith("__") and not callable(getattr(self, a)):
print("{:30} {}".format(a, getattr(self, a)))
print("\n")
model_dir is the directory where training output is saved; set_log_dir() initializes the log and checkpoint paths.
class MaskRCNN():
"""Encapsulates the Mask RCNN model functionality.
The actual Keras model is in the keras_model property.
"""
def __init__(self, mode, config, model_dir):
"""
mode: Either "training" or "inference"
“训练”或“推理”
也就是分为训练和测试
config: A Sub-class of the Config class
配置类的一个子类
这个config基本上包括了,整个代码使用的参数,通过调节这里面的参数就可以训练
model_dir: Directory to save training logs and trained weights
保存训练日志和训练权重的目录
"""
# 如果不是训练或者测试抛出异常
assert mode in ['training', 'inference']
self.mode = mode
self.config = config
self.model_dir = model_dir
self.set_log_dir()
self.keras_model = self.build(mode=mode, config=config)
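For reference, constructing the model is a one-liner. A minimal sketch, assuming the ShapesConfig from earlier; the log path is a placeholder:
import mrcnn.model as modellib

# mode selects which graph build() assembles; model_dir is where
# set_log_dir() will place checkpoints and TensorBoard logs.
model = modellib.MaskRCNN(mode="training", config=config, model_dir="./logs")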
Back in the main flow:
build() first validates the input image dimensions. Input images are preprocessed before they reach the network, and the smallest processed size used here is 128.
def build(self, mode, config):
"""Build Mask R-CNN architecture.
input_shape: The shape of the input image.
mode: Either "training" or "inference". The inputs and
outputs of the model differ accordingly.
"""
# Raise an exception if mode is neither training nor inference
assert mode in ['training', 'inference']
# Image size must be dividable by 2 multiple times
# Enforce that the (resized) image size is 2^n with n >= 6, so repeated downsampling never produces fractional sizes
h, w = config.IMAGE_SHAPE[:2]
if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
raise Exception("Image size must be dividable by 2 at least 6 times "
"to avoid fractions when downscaling and upscaling."
"For example, use 256, 320, 384, 448, 512, ... etc. ")
Back in the main flow:
In training mode, the input tensors are created next.
# Inputs
'''
Below, the Keras input tensors are created.
Note that unlike TF placeholders, Keras Input shapes do not include the batch dimension.
'''
# input_image: the input image
# input_image_meta: image metadata (shape, preprocessing info, etc.), of length IMAGE_META_SIZE
input_image = KL.Input(
shape=[None, None, 3], name="input_image")
input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
name="input_image_meta")
if mode == "training":
# input_rpn_match and input_rpn_bbox are used when computing the losses
input_rpn_match = KL.Input(
shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
input_rpn_bbox = KL.Input(
shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)
# Ground-truth detections (class IDs, boxes, and masks)
input_gt_class_ids = KL.Input(
shape=[None], name="input_gt_class_ids", dtype=tf.int32)
# 2. GT boxes in pixels (zero padded)
# [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates; MAX_GT_INSTANCES = 100
input_gt_boxes = KL.Input(
shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
# Normalize coordinates (zero padded)
gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(
x, K.shape(input_image)[1:3]))(input_gt_boxes)
# 3. GT masks (zero padded)
# [56, 56] when USE_MINI_MASK is True, otherwise full image size (e.g. [512, 512]); the mini-mask is the tightest crop around each instance box
if config.USE_MINI_MASK:
input_gt_masks = KL.Input(
shape=[config.MINI_MASK_SHAPE[0],
config.MINI_MASK_SHAPE[1], None],
name="input_gt_masks", dtype=bool)
else:
input_gt_masks = KL.Input(
shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
name="input_gt_masks", dtype=bool)
Back in the main flow:
If the mode is inference, this branch runs instead.
# Not training, so build the inference inputs
elif mode == "inference":
# input_anchors: the anchor boxes, [batch, None, 4]
# Anchors in normalized coordinates
input_anchors = KL.Input(shape=[None, 4], name="input_anchors")
Back in the main flow:
Next comes the backbone, a ResNet. This implementation supports ResNet50 and ResNet101 (the assert in resnet_graph enforces it), and BACKBONE selects which one is built.
The network code itself follows; I won't walk through the architecture in detail.
# Build the shared convolutional layers.
# Bottom-up Layers
# Returns a list of the last layers of each stage, 5 in total.
# Don't create the head (stage 5), so we pick the 4th item in the list.
# ResNet is the deep residual network; this implementation supports the
# 50- and 101-layer variants, and here config.BACKBONE = "resnet101":
_, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE,
stage5=True, train_bn=config.TRAIN_BN)
# Top-down Layers
# TODO: add assert to verify feature map sizes match what's in config
P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5)
P4 = KL.Add(name="fpn_p4add")([
KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4)])
P3 = KL.Add(name="fpn_p3add")([
KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3)])
P2 = KL.Add(name="fpn_p2add")([
KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2)])
# Attach 3x3 conv to all P layers to get the final feature maps.
P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2)
P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3)
P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4)
P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5)
# P6 is used for the 5th anchor scale in RPN. Generated by
# subsampling from P5 with a stride of 2.
P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)
# Note that P6 is used in RPN, but not in the classifier heads.
# rpn_feature_maps (the solid-line outputs in the figure) feed the RPN, which
# classifies each anchor as foreground/background and regresses its deltas;
# mrcnn_feature_maps are the maps that ROI Align later crops from.
rpn_feature_maps = [P2, P3, P4, P5, P6]
mrcnn_feature_maps = [P2, P3, P4, P5]
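To make the pyramid sizes concrete, here is a small stand-in for model.compute_backbone_shapes, which (for the non-callable backbones) does the same ceil-division over BACKBONE_STRIDES:
import numpy as np

def backbone_shapes(image_shape, strides=(4, 8, 16, 32, 64)):
    # each FPN level sees the image downsampled by its stride
    return np.array([[int(np.ceil(image_shape[0] / s)),
                      int(np.ceil(image_shape[1] / s))] for s in strides])

print(backbone_shapes((1024, 1024)))  # [[256 256] [128 128] [64 64] [32 32] [16 16]]
print(backbone_shapes((128, 128)))    # [[32 32] [16 16] [8 8] [4 4] [2 2]]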
Jumping to resnet_graph:
This is also plain network code; the structure is clear from the architecture diagram, so I won't trace the layers one by one.
def resnet_graph(input_image, architecture, stage5=False, train_bn=True):
"""Build a ResNet graph.
architecture: Can be resnet50 or resnet101
stage5: Boolean. If False, stage5 of the network is not created
train_bn: Boolean. Train or freeze Batch Norm layers
"""
assert architecture in ["resnet50", "resnet101"]
# Stage 1
x = KL.ZeroPadding2D((3, 3))(input_image)
x = KL.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
x = BatchNorm(name='bn_conv1')(x, training=train_bn)
x = KL.Activation('relu')(x)
C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
# Stage 2
x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), train_bn=train_bn)
x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', train_bn=train_bn)
C2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', train_bn=train_bn)
# Stage 3
x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', train_bn=train_bn)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', train_bn=train_bn)
x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', train_bn=train_bn)
C3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', train_bn=train_bn)
# Stage 4
x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', train_bn=train_bn)
block_count = {"resnet50": 5, "resnet101": 22}[architecture]
for i in range(block_count):
x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_bn=train_bn)
C4 = x
# Stage 5
if stage5:
x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', train_bn=train_bn)
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', train_bn=train_bn)
C5 = x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', train_bn=train_bn)
else:
C5 = None
return [C1, C2, C3, C4, C5]
Back to the main function:
We end up with two collections of feature maps, rpn_feature_maps and mrcnn_feature_maps:
rpn_feature_maps gathers [P2, P3, P4, P5, P6], the levels fed to the RPN;
mrcnn_feature_maps gathers [P2, P3, P4, P5], the levels ROI Align crops from later.
Back in the main flow:
Taking a 128x128 input image as the running example, there are 4,092 anchors.
A look at the relevant parameters:
GPU_COUNT: number of GPUs
IMAGES_PER_GPU: a 12GB GPU can typically handle two 1024x1024 images, so 2
BATCH_SIZE = IMAGES_PER_GPU * GPU_COUNT = 2 * 1 = 2
The first anchors array has shape (4092, 4).
The second has shape (2, 4092, 4): a batch dimension of size BATCH_SIZE is added by broadcasting.
The else branch is the inference case.
if mode == "training":
anchors = self.get_anchors(config.IMAGE_SHAPE)
# np.broadcast_to replicates the array across a batch dimension, because Keras expects one
anchors = np.broadcast_to(anchors, (config.BATCH_SIZE,) + anchors.shape)
# A hack to get around Keras's bad support for constants
'''
A Keras Model cannot take a raw tf Tensor in its data flow, so KL.Lambda is
used to wrap the TF-produced value into something Keras layers can accept,
as done here with the constant anchors.
'''
anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
else:
anchors = input_anchors
Jumping to get_anchors:
def get_anchors(self, image_shape):
"""返回给定图像大小的锚金字塔"""
backbone_shapes = compute_backbone_shapes(self.config, image_shape)
# Cache the anchors and reuse them if the image shape is the same
if not hasattr(self, "_anchor_cache"):
self._anchor_cache = {}
if not tuple(image_shape) in self._anchor_cache:
# Generate the anchors
a = utils.generate_pyramid_anchors(
self.config.RPN_ANCHOR_SCALES,
self.config.RPN_ANCHOR_RATIOS,
backbone_shapes,
self.config.BACKBONE_STRIDES,
self.config.RPN_ANCHOR_STRIDE)
self.anchors = a
# Normalize the coordinates
self._anchor_cache[tuple(image_shape)] = utils.norm_boxes(a, image_shape[:2])
return self._anchor_cache[tuple(image_shape)]
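utils.norm_boxes converts pixel-space boxes into the normalized coordinates used everywhere downstream. Paraphrased in NumPy, it is just:
import numpy as np

def norm_boxes(boxes, shape):
    """boxes: [N, (y1, x1, y2, x2)] in pixels; shape: (height, width)."""
    h, w = shape
    scale = np.array([h - 1, w - 1, h - 1, w - 1])
    shift = np.array([0, 0, 1, 1])  # x2/y2 are shifted so a full-image box maps to 1.0
    return ((boxes - shift) / scale).astype(np.float32)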
Jumping to generate_pyramid_anchors:
Parameters:
RPN_ANCHOR_SCALES (32, 64, 128, 256, 512): anchor sizes
RPN_ANCHOR_RATIOS [0.5, 1, 2]: aspect ratios
feature_shapes [[256 256] [128 128] [ 64 64] [ 32 32] [ 16 16]]
BACKBONE_STRIDES [4, 8, 16, 32, 64]
RPN_ANCHOR_STRIDE 1
A 1024x1024 input produces 261,888 anchors; a 128x128 input produces 4,092.
The 1024 input size is what determines feature_shapes = [[256 256] [128 128] [ 64 64] [ 32 32] [ 16 16]] above.
def generate_pyramid_anchors(scales, ratios, feature_shapes, feature_strides,
anchor_stride):
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
for i in range(len(scales)):
anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i],
feature_strides[i], anchor_stride))
return np.concatenate(anchors, axis=0)
Jumping to generate_anchors. This function does the actual anchor arithmetic; it is short and worth reading.
def generate_anchors(scales, ratios, shape, feature_stride, anchor_stride):
# Get all combinations of scales and ratios
scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
scales = scales.flatten()
ratios = ratios.flatten()
# Enumerate heights and widths from scales and ratios
heights = scales / np.sqrt(ratios)
widths = scales * np.sqrt(ratios)
# Enumerate shifts in feature space
shifts_y = np.arange(0, shape[0], anchor_stride) * feature_stride
shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack(
[box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes,
box_centers + 0.5 * box_sizes], axis=1)
return boxes
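Because generate_anchors is plain NumPy, the anchor counts quoted above are easy to verify. A standalone check under the default scales, ratios, and strides:
import numpy as np

ratios = [0.5, 1, 2]

def anchor_count(image_size, strides=(4, 8, 16, 32, 64), anchor_stride=1):
    total = 0
    for s in strides:
        side = int(np.ceil(image_size / s))          # feature map side at this level
        cells = len(range(0, side, anchor_stride))   # anchor positions per axis
        total += len(ratios) * cells * cells         # 3 anchors per position
    return total

print(anchor_count(1024))  # 261888
print(anchor_count(128))   # 4092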
Back in the main flow:
Next comes the RPN. I won't re-derive the RPN itself here; references are easy to find online.
The same RPN head runs on each pyramid level and emits per-level outputs.
'''
Three outputs are produced:
rpn_logits: [batch, H * W * anchors_per_location, 2] Anchor classifier logits (before softmax)
    (the BG/FG two-class logits)
rpn_probs: [batch, H * W * anchors_per_location, 2] Anchor classifier probabilities
    (the BG/FG two-class probabilities)
rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh), log(dw))] Deltas to be applied to anchors
    (the regressed anchor refinements)
'''
rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, # arguments: 1, 3, 256
len(config.RPN_ANCHOR_RATIOS), 256)
# Loop through pyramid layers
layer_outputs = [] # list of lists
for p in rpn_feature_maps:
layer_outputs.append(rpn([p]))
# Concatenate layer outputs
# Convert from list of lists of level outputs to list of lists
# of outputs across levels.
# e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
outputs = list(zip(*layer_outputs))
outputs = [KL.Concatenate(axis=1, name=n)(list(o))
for o, n in zip(outputs, output_names)]
# Collect the result of running each pyramid level's features through the RPN
rpn_class_logits, rpn_class, rpn_bbox = outputs
Jumping to build_rpn_model:
# the input values are 1, 3, 256
def build_rpn_model(anchor_stride, anchors_per_location, depth):
input_feature_map = KL.Input(shape=[None, None, depth],
name="input_rpn_feature_map")
outputs = rpn_graph(input_feature_map, anchors_per_location, anchor_stride)
return KM.Model([input_feature_map], outputs, name="rpn_model")
Drilling further into rpn_graph:
def rpn_graph(feature_map, anchors_per_location, anchor_stride):
# TODO: check if stride of 2 causes alignment issues if the featuremap
# is not even.
# Shared convolutional base of the RPN
shared = KL.Conv2D(512, (3, 3), padding='same', activation='relu',
strides=anchor_stride,
name='rpn_conv_shared')(feature_map)
# Anchor Score. [batch, height, width, anchors per location * 2].
x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
activation='linear', name='rpn_class_raw')(shared)
# Reshape to [batch, anchors, 2]
rpn_class_logits = KL.Lambda(
lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x)
# Softmax on last dimension of BG/FG.
rpn_probs = KL.Activation(
"softmax", name="rpn_class_xxx")(rpn_class_logits)
# Bounding box refinement. [batch, H, W, anchors per location, depth]
# where depth is [x, y, log(w), log(h)]
x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
activation='linear', name='rpn_bbox_pred')(shared)
# Reshape to [batch, anchors, 4]
rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)
return [rpn_class_logits, rpn_probs, rpn_bbox]
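After the reshape, each level contributes H * W * anchors_per_location rows, which is what lets the per-level outputs concatenate cleanly along axis 1 back in build(). For a 1024x1024 input the split looks like this:
# feature-map side length per RPN level (P2..P6), 3 anchors per cell
sides = {"P2": 256, "P3": 128, "P4": 64, "P5": 32, "P6": 16}
per_level = {name: side * side * 3 for name, side in sides.items()}
print(per_level)               # P2 alone contributes 196608 anchors
print(sum(per_level.values())) # 261888, matching the anchor pyramid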
Back in the main flow:
POST_NMS_ROIS_TRAINING = 2000: number of ROIs kept after NMS during training
POST_NMS_ROIS_INFERENCE = 1000: number of ROIs kept after NMS during inference
RPN_NMS_THRESHOLD = 0.7: the NMS IoU threshold
Earlier we generated 4,092 anchors (for the 128x128 example).
The RPN produced rpn_bbox and rpn_class for all of them.
The ProposalLayer combines these and finally outputs 2,000 rpn_rois.
It helps to be clear about what the RPN gives us and why it lets us filter boxes:
rpn_class scores each region (foreground vs. background) and rpn_bbox proposes each region's refinement.
# Generate proposals
# Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
# and zero padded. proposal_count = 2000 during training.
proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
else config.POST_NMS_ROIS_INFERENCE
# 4092 anchors => 2000 proposals
rpn_rois = ProposalLayer(
proposal_count=proposal_count,
nms_threshold=config.RPN_NMS_THRESHOLD,
name="ROI",
config=config)([rpn_class, rpn_bbox, anchors])
Jumping to ProposalLayer:
RPN_ANCHOR_RATIOS = [0.5, 1, 2]
RPN_ANCHOR_STRIDE = 1
TOP_DOWN_PYRAMID_SIZE = 256: size of the top-down FPN layers
class ProposalLayer(KE.Layer):
"""
Inputs:
rpn_probs: [batch, anchors, (bg prob, fg prob)]
    the BG/FG classification probabilities
rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
    the anchor deltas
anchors: [batch, anchors, (y1, x1, y2, x2)] anchors in normalized coordinates
    (261,888 of them for a 1024x1024 input)
Returns:
    Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
"""
def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
super(ProposalLayer, self).__init__(**kwargs)
self.config = config
self.proposal_count = proposal_count # 2000
self.nms_threshold = nms_threshold # 0.7
def call(self, inputs): # call() is a Keras layer's forward pass; you have to know this idiom to follow the code
# Box scores. Use the foreground class confidence.
scores = inputs[0][:, :, 1] # rpn_probs: [batch, H * W * anchors_per_location, (bg prob, fg prob)]
# RPN_BBOX_STD_DEV [0.1 0.1 0.2 0.2]: the std-dev scaling applied to the deltas
# Coordinate refinements: (dy, dx, log(dh), log(dw)) ==> [batch, num_rois, 4]
deltas = inputs[1] # 调用 rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4]) # rescale the deltas
# Anchors: coordinates (y1, x1, y2, x2). [batch, num_rois, 4]
anchors = inputs[2] # anchors: [batch, anchors, (y1, x1, y2, x2)], e.g. shape=(batch, 4092, 4)
# Improve performance by trimming to the top-scoring anchors,
# keeping the (at most) 6000 candidates with the highest foreground scores
# via top_k anchor filtering
pre_nms_limit = tf.minimum(6000, tf.shape(anchors)[1])
# Given a matrix, returns the top_k entries of each row. [batch, top_k]
ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
name="top_anchors").indices
# Extract the top_k anchors; the same gather is applied to all three inputs.
# batch_slice:
#   splits the batch into single images,
#   applies the given per-image function to each,
#   and re-stacks the results (the leading dim becomes the number of slices, IMAGES_PER_GPU)
scores = utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y),
self.config.IMAGES_PER_GPU)
deltas = utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y),
self.config.IMAGES_PER_GPU)
pre_nms_anchors = utils.batch_slice([anchors, ix], lambda a, x: tf.gather(a, x),
self.config.IMAGES_PER_GPU,
names=["pre_nms_anchors"])
# Apply deltas to anchors to get refined anchors.
# [batch, N, (y1, x1, y2, x2)]
'''
Initial anchor refinement:
the RPN regressed deltas for every anchor, rpn_bbox: [batch, anchors,
(dy, dx, log(dh), log(dw))]. Above we extracted the top-k anchors and their
corresponding deltas; now we combine them, using the RPN regression results
to refine the top-k anchor coordinates.
'''
boxes = utils.batch_slice([pre_nms_anchors, deltas],
lambda x, y: apply_box_deltas_graph(x, y), # refine the boxes
self.config.IMAGES_PER_GPU,
names=["refined_anchors"])
# Clip to image boundaries. Since we're in normalized coordinates,
# clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
'''
Note that the anchor coordinates live on a normalized canvas. The refinement
above no longer guarantees this, so we clip away whatever sticks out,
keeping only the intersection of each box with the [0, 0, 1, 1] window.
'''
window = np.array([0, 0, 1, 1], dtype=np.float32) # discard parts outside this window
boxes = utils.batch_slice(boxes,
lambda x: clip_boxes_graph(x, window), # the clipping helper
self.config.IMAGES_PER_GPU,
names=["refined_anchors_clipped"])
# Filtering out small boxes is skipped here.
# Non-max suppression
# Finally, NMS ensures the proposals don't overlap each other too heavily.
def nms(boxes, scores):
"""
非极大值抑制子函数
boxes: [top_k, (y1, x1, y2, x2)]
scores: [top_k]
"""
# tf.image.non_max_suppression removes boxes that have a high IoU with an already-selected box
indices = tf.image.non_max_suppression(
boxes, scores, self.proposal_count, # the third argument caps how many boxes are returned
self.nms_threshold, name="rpn_non_max_suppression")
# tf.gather extracts the selected boxes from `boxes`
proposals = tf.gather(boxes, indices)
# Pad if needed: if fewer than proposal_count boxes survive, pad with (0,0,0,0) until the count is met
# tf.maximum returns the larger of x and y
padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
# tf.pad appends all-zero rows at the end
proposals = tf.pad(proposals, [(0, padding), (0, 0)])
return proposals
# Apply nms() per batch slice
proposals = utils.batch_slice([boxes, scores], nms,
self.config.IMAGES_PER_GPU)
return proposals
def compute_output_shape(self, input_shape):
return (None, self.proposal_count, 4)
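apply_box_deltas_graph and clip_boxes_graph are not shown above; paraphrasing them in NumPy (the TF graph versions do the same math on tensors):
import numpy as np

def apply_box_deltas(boxes, deltas):
    """boxes: [N, (y1, x1, y2, x2)]; deltas: [N, (dy, dx, log(dh), log(dw))]."""
    h = boxes[:, 2] - boxes[:, 0]
    w = boxes[:, 3] - boxes[:, 1]
    cy = boxes[:, 0] + 0.5 * h
    cx = boxes[:, 1] + 0.5 * w
    cy += deltas[:, 0] * h      # shift the center...
    cx += deltas[:, 1] * w
    h *= np.exp(deltas[:, 2])   # ...then rescale height and width
    w *= np.exp(deltas[:, 3])
    return np.stack([cy - 0.5 * h, cx - 0.5 * w,
                     cy + 0.5 * h, cx + 0.5 * w], axis=1)

def clip_boxes(boxes, window=(0., 0., 1., 1.)):
    y1, x1, y2, x2 = window
    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(y1, y2)
    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(x1, x2)
    return boxes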
The layer above gives us the 2,000 ROIs.
Back in the main flow:
if mode == "training":
# Class ID mask to mark class IDs supported by the dataset the image came from,
# parsed out of the image meta
active_class_ids = KL.Lambda(
lambda x: parse_image_meta_graph(x)["active_class_ids"]
)(input_image_meta)
# USE_RPN_ROIS is True by default, so this branch is skipped
if not config.USE_RPN_ROIS:
# Ignore predicted ROIs and use the externally provided ROIs as input (POST_NMS_ROIS_TRAINING = 2000)
input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
name="input_roi", dtype=np.int32)
# Normalize coordinates
target_rois = KL.Lambda(lambda x: norm_boxes_graph(
x, K.shape(input_image)[1:3]))(input_rois)
else:
target_rois = rpn_rois
# Generate detection targets:
# subsample the proposals and generate target outputs for training.
# Note that proposal class IDs, gt_boxes, and gt_masks are zero
# padded. Equally, returned rois and targets are zero padded.
# This yields 200 proposal boxes, 33% positive and the rest negative.
'''
rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
target_class_ids: the class IDs
target_bbox: the box deltas
target_mask: the instance masks
'''
rois, target_class_ids, target_bbox, target_mask =\
DetectionTargetLayer(config, name="proposal_targets")([
target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])
Jumping to DetectionTargetLayer:
Its inputs: target_rois (the 2,000 proposals generated above), input_gt_class_ids (the ground-truth class IDs), gt_boxes (the ground-truth boxes), and input_gt_masks (the ground-truth masks).
class DetectionTargetLayer(KE.Layer):
"""Subsamples proposals and generates target box refinement, class_ids,and masks for each.
子样本提议,并生成目标框细化,class_id,以及每个人的面具。
分别有锚框和面具,这里的锚框
Inputs:
proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might be zero padded if there are not enough proposals.
如果没有足够的建议,就用零填充。
gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs. [?, 100]
gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates. [?, 100, (...)]
gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type [?, ., ., 100]
Returns: Target ROIs and corresponding class IDs, bounding box shifts,and masks.
目标roi和对应的类id,边界框移位,和面具
rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates [?, 32, (....)]
target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs. [?, 32]
target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (dy, dx, log(dh), log(dw), class_id)]
Class-specific bbox refinements. bbox改进 [?, 32, 4, (....), class_id]
target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width)
Masks cropped to bbox boundaries and resized to neural network output size.
掩模裁剪到bbox边界并调整为神经网络网络输出尺寸 [?, 32, ]
Note: Returned arrays might be zero padded if not enough target ROIs.
如果没有足够的目标roi,返回的数组可能填充为零
"""
def __init__(self, config, **kwargs):
super(DetectionTargetLayer, self).__init__(**kwargs)
self.config = config
def call(self, inputs):
proposals = inputs[0]
gt_class_ids = inputs[1]
gt_boxes = inputs[2]
gt_masks = inputs[3]
# Slice the batch and run a graph for each slice
# TODO: Rename target_bbox to target_deltas for clarity
names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
outputs = utils.batch_slice(
[proposals, gt_class_ids, gt_boxes, gt_masks],
lambda w, x, y, z: detection_targets_graph(
w, x, y, z, self.config),
self.config.IMAGES_PER_GPU, names=names)
return outputs
def compute_output_shape(self, input_shape):
return [
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), # rois [?, 200, 4]
(None, 1), # class_ids
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), # deltas [?, 200, 4]
(None, self.config.TRAIN_ROIS_PER_IMAGE, self.config.MASK_SHAPE[0],
self.config.MASK_SHAPE[1]) # masks
]
def compute_mask(self, inputs, mask=None):
return [None, None, None, None]
Jumping to detection_targets_graph:
I haven't traced every line here. In short, it excludes crowd boxes (single boxes that annotate several objects), samples positive and negative proposals, and keeps 200 ROIs in the end.
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config):
"""Generates detection targets for one image. Subsamples proposals and
generates target class IDs, bounding box deltas, and masks for each.
Inputs:
proposals: [N, (y1, x1, y2, x2)] in normalized coordinates. Might
be zero padded if there are not enough proposals.
gt_class_ids: [MAX_GT_INSTANCES] int class IDs
gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.
Returns: Target ROIs and corresponding class IDs, bounding box shifts,
and masks.
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
deltas: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
Class-specific bbox refinements.
masks: [TRAIN_ROIS_PER_IMAGE, height, width). Masks cropped to bbox
boundaries and resized to neural network output size.
Note: Returned arrays might be zero padded if not enough target ROIs.
"""
# Assertions
asserts = [
tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
name="roi_assertion"),
]
with tf.control_dependencies(asserts):
proposals = tf.identity(proposals)
# Remove zero padding
# (the zero rows added as padding earlier are trimmed off)
proposals, _ = trim_zeros_graph(proposals, name="trim_proposals")
gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros,
name="trim_gt_class_ids")
gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2,
name="trim_gt_masks")
# Handle COCO crowds
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
crowd_boxes = tf.gather(gt_boxes, crowd_ix)
crowd_masks = tf.gather(gt_masks, crowd_ix, axis=2)
gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)
# Compute the overlaps (IoU) matrix [proposals, gt_boxes]
overlaps = overlaps_graph(proposals, gt_boxes)
# Compute overlaps with crowd boxes [proposals, crowd_boxes]
crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001)
# Determine positive and negative ROIs
roi_iou_max = tf.reduce_max(overlaps, axis=1)
# 1. Positive ROIs are those with >= 0.5 IoU with a GT box
positive_roi_bool = (roi_iou_max >= 0.5)
positive_indices = tf.where(positive_roi_bool)[:, 0]
# 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]
# Subsample ROIs, aiming for 33% positive
# Positive ROIs
positive_count = int(config.TRAIN_ROIS_PER_IMAGE * # 200 * 0.33 = 66
config.ROI_POSITIVE_RATIO)
positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
positive_count = tf.shape(positive_indices)[0]
# Negative ROIs. Add enough to maintain positive:negative ratio.
r = 1.0 / config.ROI_POSITIVE_RATIO
negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count
negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
# Gather selected ROIs
positive_rois = tf.gather(proposals, positive_indices)
negative_rois = tf.gather(proposals, negative_indices)
# Assign positive ROIs to GT boxes.
positive_overlaps = tf.gather(overlaps, positive_indices)
roi_gt_box_assignment = tf.argmax(positive_overlaps, axis=1)
roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)
# Compute bbox refinement for positive ROIs
deltas = utils.box_refinement_graph(positive_rois, roi_gt_boxes)
deltas /= config.BBOX_STD_DEV
# Assign positive ROIs to GT masks
# Permute masks to [N, height, width, 1]
transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1)
# Pick the right mask for each ROI
roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)
# Compute mask targets
boxes = positive_rois
if config.USE_MINI_MASK:
# Transform ROI coordinates from normalized image space
# to normalized mini-mask space.
y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
gt_h = gt_y2 - gt_y1
gt_w = gt_x2 - gt_x1
y1 = (y1 - gt_y1) / gt_h
x1 = (x1 - gt_x1) / gt_w
y2 = (y2 - gt_y1) / gt_h
x2 = (x2 - gt_x1) / gt_w
boxes = tf.concat([y1, x1, y2, x2], 1)
box_ids = tf.range(0, tf.shape(roi_masks)[0])
masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes,
box_ids,
config.MASK_SHAPE)
# Remove the extra dimension from masks.
masks = tf.squeeze(masks, axis=3)
# Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
# binary cross entropy loss.
masks = tf.round(masks)
# Append negative ROIs and pad bbox deltas and masks that
# are not used for negative ROIs with zeros.
rois = tf.concat([positive_rois, negative_rois], axis=0)
N = tf.shape(negative_rois)[0]
P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
rois = tf.pad(rois, [(0, P), (0, 0)])
roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)])
return rois, roi_gt_class_ids, deltas, masks
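The positive/negative bookkeeping is worth making concrete. With the defaults TRAIN_ROIS_PER_IMAGE = 200 and ROI_POSITIVE_RATIO = 0.33:
TRAIN_ROIS_PER_IMAGE = 200
ROI_POSITIVE_RATIO = 0.33

positive_count = int(TRAIN_ROIS_PER_IMAGE * ROI_POSITIVE_RATIO)  # 66
r = 1.0 / ROI_POSITIVE_RATIO                                     # ~3.03
negative_count = int(r * positive_count) - positive_count        # 134
print(positive_count, negative_count)                            # 66 134, i.e. 200 total
If fewer than 66 proposals actually reach 0.5 IoU, positive_count shrinks to what is available and the tail of the 200 is zero padded, which is why the returned arrays may be partly zeros.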
Jumping to fpn_classifier_graph:
We now enter the FPN heads.
The 200 ROIs obtained above are consumed here.
POOL_SIZE = 7
NUM_CLASSES = 2 (in our experiment: background plus one object class)
TRAIN_BN: whether to train or freeze the BatchNorm layers
The important operation here is PyramidROIAlign.
The head returns two things: the class predictions and the box refinements.
def fpn_classifier_graph(rois, feature_maps, image_meta,
pool_size, num_classes, train_bn=True):
"""Builds the computation graph of the feature pyramid network classifier
and regressor heads.
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized
coordinates.
feature_maps: List of feature maps from diffent layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution.
- image_meta: [batch, (meta data)] Image details. See compose_image_meta()
pool_size: The width of the square feature map generated from ROI Pooling.
num_classes: number of classes, which determines the depth of the results
train_bn: Boolean. Train or freeze Batch Norm layres
Returns:
logits: [N, NUM_CLASSES] classifier logits (before softmax)
probs: [N, NUM_CLASSES] classifier probabilities
bbox_deltas: [N, (dy, dx, log(dh), log(dw))] Deltas to apply to
proposal boxes
"""
# ROI Pooling
# Shape: [batch, num_boxes, pool_height, pool_width, channels]
x = PyramidROIAlign([pool_size, pool_size],
name="roi_align_classifier")([rois, image_meta] + feature_maps)
# Two 1024 FC layers (implemented with Conv2D for consistency)
x = KL.TimeDistributed(KL.Conv2D(1024, (pool_size, pool_size), padding="valid"),
name="mrcnn_class_conv1")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn1')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(1024, (1, 1)),
name="mrcnn_class_conv2")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn2')(x, training=train_bn)
x = KL.Activation('relu')(x)
shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2),
name="pool_squeeze")(x)
# Classifier head
mrcnn_class_logits = KL.TimeDistributed(KL.Dense(num_classes),
name='mrcnn_class_logits')(shared)
mrcnn_probs = KL.TimeDistributed(KL.Activation("softmax"),
name="mrcnn_class")(mrcnn_class_logits)
# BBox head
# [batch, boxes, num_classes * (dy, dx, log(dh), log(dw))]
x = KL.TimeDistributed(KL.Dense(num_classes * 4, activation='linear'),
name='mrcnn_bbox_fc')(shared)
# Reshape to [batch, boxes, num_classes, (dy, dx, log(dh), log(dw))]
s = K.int_shape(x)
mrcnn_bbox = KL.Reshape((s[1], num_classes, 4), name="mrcnn_bbox")(x)
return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox
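One detail hidden inside PyramidROIAlign: each ROI is cropped from a single pyramid level, chosen with the FPN paper's heuristic k = k0 + log2(sqrt(w*h)/224) with k0 = 4, where 224 is measured in pixels. A NumPy sketch of that assignment (the layer does the same with tensors and clips to levels 2..5):
import numpy as np

def roi_level(box, image_area=1024 * 1024):
    """box: (y1, x1, y2, x2) in normalized coordinates; returns a level in 2..5."""
    y1, x1, y2, x2 = box
    h, w = y2 - y1, x2 - x1
    # a 224x224-pixel ROI maps to level 4 (P4)
    k = 4 + np.log2(np.sqrt(h * w) / (224.0 / np.sqrt(image_area)))
    return int(np.clip(np.round(k), 2, 5))

print(roi_level((0.0, 0.0, 0.22, 0.22)))  # ~224px box on a 1024 image -> 4
print(roi_level((0.0, 0.0, 0.05, 0.05)))  # small box -> 2 (the finest level)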
Jumping to build_fpn_mask_graph:
This head returns the masks.
def build_fpn_mask_graph(rois, feature_maps, image_meta,
pool_size, num_classes, train_bn=True):
"""Builds the computation graph of the mask head of Feature Pyramid Network.
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized
coordinates.
feature_maps: List of feature maps from diffent layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution.
image_meta: [batch, (meta data)] Image details. See compose_image_meta()
pool_size: The width of the square feature map generated from ROI Pooling.
num_classes: number of classes, which determines the depth of the results
train_bn: Boolean. Train or freeze Batch Norm layres
Returns: Masks [batch, roi_count, height, width, num_classes]
"""
# ROI Pooling
# Shape: [batch, boxes, pool_height, pool_width, channels]
x = PyramidROIAlign([pool_size, pool_size],
name="roi_align_mask")([rois, image_meta] + feature_maps)
# Conv layers
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv1")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn1')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv2")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn2')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv3")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn3')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv4")(x)
x = KL.TimeDistributed(BatchNorm(),
name='mrcnn_mask_bn4')(x, training=train_bn)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"),
name="mrcnn_mask_deconv")(x)
x = KL.TimeDistributed(KL.Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"),
name="mrcnn_mask")(x)
return x
Back in the main flow:
Computing the losses.
# TODO: clean up (use tf.identify if necessary)
output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)
# Losses
'''
rpn_class_loss_graph computes the RPN anchor classification loss.
input_rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive, -1=negative, 0=neutral anchor.
rpn_class_logits: the RPN's BG/FG logits for each anchor.
'''
rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")(
[input_rpn_match, rpn_class_logits])
rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
[input_rpn_bbox, input_rpn_match, rpn_bbox])
class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
[target_class_ids, mrcnn_class_logits, active_class_ids])
bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
[target_bbox, target_class_ids, mrcnn_bbox])
mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
[target_mask, target_class_ids, mrcnn_mask])
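The five loss graphs live further down in model.py. As one representative example, rpn_class_loss_graph keeps only the non-neutral anchors and applies sparse categorical cross-entropy to the BG/FG logits; paraphrased:
import tensorflow as tf
import keras.backend as K

def rpn_class_loss_graph(rpn_match, rpn_class_logits):
    """rpn_match: [batch, anchors, 1], 1=positive, -1=negative, 0=neutral.
    rpn_class_logits: [batch, anchors, 2] BG/FG logits."""
    rpn_match = tf.squeeze(rpn_match, -1)
    # positive anchors become class 1, everything else class 0
    anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32)
    # neutral anchors (match == 0) do not contribute to the loss
    indices = tf.where(K.not_equal(rpn_match, 0))
    rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
    anchor_class = tf.gather_nd(anchor_class, indices)
    loss = K.sparse_categorical_crossentropy(target=anchor_class,
                                             output=rpn_class_logits,
                                             from_logits=True)
    return K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))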
What follows is the standard Keras model assembly:
inputs = [input_image, input_image_meta,
input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
if not config.USE_RPN_ROIS:
inputs.append(input_rois)
outputs = [rpn_class_logits, rpn_class, rpn_bbox,
mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
rpn_rois, output_rois,
rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
model = KM.Model(inputs, outputs, name='mask_rcnn')
else:
# Network Heads
# Proposal classifier and BBox regressor heads
# shapes: (?, 1000, NUM_CLASSES), (?, 1000, NUM_CLASSES), (?, 1000, NUM_CLASSES, 4)
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
config.POOL_SIZE, config.NUM_CLASSES,
train_bn=config.TRAIN_BN)
# Detections
# output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
# normalized coordinates
# shape=(1, 100, 6), since DETECTION_MAX_INSTANCES = 100
# Takes the classified proposal boxes and their bbox deltas and returns the final detection boxes
detections = DetectionLayer(config, name="mrcnn_detection")(
[rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])
# Create masks for detections
detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps,
input_image_meta,
config.MASK_POOL_SIZE,
config.NUM_CLASSES,
train_bn=config.TRAIN_BN)
model = KM.Model([input_image, input_image_meta, input_anchors],
[detections, mrcnn_class, mrcnn_bbox,
mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],
name='mask_rcnn')
# Add multi-GPU support.
if config.GPU_COUNT > 1:
from mrcnn.parallel_model import ParallelModel
model = ParallelModel(model, config.GPU_COUNT)
return model
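To close the loop, this is roughly how the inference graph built above gets used. A sketch assuming image is an RGB numpy array and the weight path is a placeholder:
import mrcnn.model as modellib

model = modellib.MaskRCNN(mode="inference", config=config, model_dir="./logs")
model.load_weights("mask_rcnn_coco.h5", by_name=True)
# detect() resizes the images, computes the anchor pyramid, feeds
# [images, image_metas, anchors], and unpacks the DetectionLayer output
results = model.detect([image], verbose=0)
r = results[0]  # r["rois"], r["class_ids"], r["scores"], r["masks"]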
End of the build() walkthrough.