Define a new FasterRCNNFeatureExtractor and pass it to our FasterRCNNMetaArch constructor as input.
定义一个 FasterRCNNFeatureExtractor,并把它作为输入传给 FasterRCNNMetaArch 的构造函数(constructor)。
A FasterRCNNFeatureExtractor must define a few functions:
preprocess: Run any preprocessing of input values that is necessary prior to running the detector on an input image.
_extract_proposal_features: Extract first stage Region Proposal Network (RPN) features.提取第一阶段区域建议网络(RPN)特征。
_extract_box_classifier_features: Extract second stage Box Classifier features.提取第二阶段盒分类器特征。
restore_from_classification_checkpoint_fn: Load a checkpoint into the TensorFlow graph.
1、FasterRCNNFeatureExtractor
from abc import abstractmethod
from functools import partial
import tensorflow as tf
from object_detection.anchor_generators import grid_anchor_generator
from object_detection.core import balanced_positive_negative_sampler as sampler
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import box_predictor
from object_detection.core import losses
from object_detection.core import model
from object_detection.core import post_processing
from object_detection.core import standard_fields as fields
from object_detection.core import target_assigner
from object_detection.utils import ops
from object_detection.utils import shape_utils
slim = tf.contrib.slim
class FasterRCNNFeatureExtractor(object):
    """Faster R-CNN Feature Extractor definition.

    Base class for the feature extractor that is handed to the Faster R-CNN
    meta-architecture. Subclasses provide the three hooks `preprocess`,
    `_extract_proposal_features` and `_extract_box_classifier_features`.

    NOTE: the hooks are decorated with `@abstractmethod`, but this class
    derives from plain `object` (no `ABCMeta` metaclass), so Python does not
    enforce the overrides; an un-overridden hook silently returns None.
    """

    def __init__(self,
                 is_training,
                 first_stage_features_stride,
                 batch_norm_trainable=False,
                 reuse_weights=None,
                 weight_decay=0.0):
        """Constructor.

        Args:
          is_training: A boolean indicating whether the training version of
            the computation graph should be constructed.
          first_stage_features_stride: Output stride of the extracted RPN
            feature map.
          batch_norm_trainable: Whether to update batch norm parameters
            during training or not. When training with a relatively large
            batch size (e.g. 8), it could be desirable to enable batch norm
            update.
          reuse_weights: Whether to reuse variables. Default is None.
          weight_decay: float weight decay for feature extractor
            (default: 0.0).
        """
        self._is_training = is_training
        self._first_stage_features_stride = first_stage_features_stride
        # Batch norm statistics are only updated when both flags are set.
        self._train_batch_norm = (batch_norm_trainable and is_training)
        self._reuse_weights = reuse_weights
        self._weight_decay = weight_decay

    @abstractmethod
    def preprocess(self, resized_inputs):
        """Feature-extractor specific preprocessing (minus image resizing)."""
        pass

    def extract_proposal_features(self, preprocessed_inputs, scope):
        """Extracts first stage RPN features.

        This function is responsible for extracting feature maps from the
        preprocessed images. These features are used by the Region Proposal
        Network (RPN) to predict proposals. The work is delegated to
        `_extract_proposal_features` inside a variable scope named `scope`.

        Args:
          preprocessed_inputs: A [batch, height, width, channels] float
            tensor representing a batch of images.
          scope: A scope name.

        Returns:
          rpn_feature_map: A tensor with shape [batch, height, width, depth].
            (The original annotation describes it as the ResNet-101 + FPN
            output; that depends on the concrete subclass - confirm.)
          activations: A dictionary mapping activation tensor names to
            tensors.
        """
        with tf.variable_scope(scope, values=[preprocessed_inputs]):
            return self._extract_proposal_features(preprocessed_inputs, scope)

    @abstractmethod
    def _extract_proposal_features(self, preprocessed_inputs, scope):
        """Extracts first stage RPN features, to be overridden."""
        pass

    def extract_box_classifier_features(self, proposal_feature_maps, scope):
        """Extracts second stage box classifier features.

        Delegates to `_extract_box_classifier_features` inside a variable
        scope opened with `reuse=tf.AUTO_REUSE`, so repeated calls with the
        same scope share variables.

        Args:
          proposal_feature_maps: A 4-D float tensor with shape
            [batch_size * self.max_num_proposals, crop_height, crop_width,
            depth] representing the feature map cropped to each proposal.
          scope: A scope name.

        Returns:
          proposal_classifier_features: A 4-D float tensor with shape
            [batch_size * self.max_num_proposals, height, width, depth]
            representing box classifier features for each proposal.
        """
        with tf.variable_scope(
                scope, values=[proposal_feature_maps], reuse=tf.AUTO_REUSE):
            return self._extract_box_classifier_features(
                proposal_feature_maps, scope)

    @abstractmethod
    def _extract_box_classifier_features(self, proposal_feature_maps, scope):
        """Extracts second stage box classifier features, to be overridden."""
        pass

    def restore_from_classification_checkpoint_fn(
            self,
            first_stage_feature_extractor_scope,
            second_stage_feature_extractor_scope):
        """Returns a map of variables to load from a foreign checkpoint.

        Args:
          first_stage_feature_extractor_scope: A scope name for the first
            stage feature extractor.
          second_stage_feature_extractor_scope: A scope name for the second
            stage feature extractor.

        Returns:
          A dict mapping variable names (to load from a checkpoint) to
          variables in the model graph.
        """
        variables_to_restore = {}
        for variable in tf.global_variables():
            for scope_name in [first_stage_feature_extractor_scope,
                               second_stage_feature_extractor_scope]:
                # NOTE(review): the match is a bare string prefix (no trailing
                # '/'), and `replace` removes every occurrence of
                # `scope_name + '/'`, not only the leading one. Kept as-is to
                # preserve the existing checkpoint key mapping.
                if variable.op.name.startswith(scope_name):
                    var_name = variable.op.name.replace(scope_name + '/', '')
                    variables_to_restore[var_name] = variable
        return variables_to_restore
2、FasterRCNNMetaArch
predict
def predict(self, preprocessed_inputs, true_image_shapes):
    """Predicts unpostprocessed tensors from the input tensor.

    If `number_of_stages` is 1, this function only returns first stage
    RPN predictions (un-postprocessed). Otherwise it returns both
    first stage RPN predictions as well as second stage box classifier
    predictions.

    Other remarks:
    + Anchor pruning vs. clipping: following the recommendation of the
      Faster R-CNN paper, anchors that venture outside the image window are
      pruned at training time and clipped to the image window at inference
      time.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be
        padded with zeros.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) rpn_box_predictor_features: A 4-D float32 tensor with shape
          [batch_size, height, width, depth] to be used for predicting
          proposal boxes and corresponding objectness scores.
        2) rpn_features_to_crop: A 4-D float32 tensor with shape
          [batch_size, height, width, depth] representing image features to
          crop using the proposal boxes predicted by the RPN.
        3) image_shape: a 1-D tensor of shape [4] representing the input
          image shape.
        4) rpn_box_encodings: 3-D float tensor of shape
          [batch_size, num_anchors, self._box_coder.code_size] containing
          predicted boxes.
        5) rpn_objectness_predictions_with_background: 3-D float tensor of
          shape [batch_size, num_anchors, 2] containing class
          predictions (logits) for each of the anchors. Note that this
          tensor *includes* background class predictions (at class index 0).
        6) anchors: A 2-D tensor of shape [num_anchors, 4] representing
          anchors for the first stage RPN (in absolute coordinates). Note
          that `num_anchors` can differ depending on whether the model is
          created in training or inference mode.

        (and if number_of_stages > 1):
        7) refined_box_encodings: a 3-D tensor with shape
          [total_num_proposals, num_classes, self._box_coder.code_size]
          representing predicted (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals. If using
          a shared box across classes the shape will instead be
          [total_num_proposals, 1, self._box_coder.code_size].
        8) class_predictions_with_background: a 2-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Note that this tensor *includes* background class predictions
          (at class index 0).
        9) num_proposals: An int32 tensor of shape [batch_size] representing
          the number of proposals generated by the RPN. `num_proposals`
          allows us to keep track of which entries are to be treated as zero
          paddings and which are not since we always pad the number of
          proposals to be `self.max_num_proposals` for each image.
        10) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
          decoded proposal bounding boxes in absolute coordinates.
        11) mask_predictions: (optional) a 4-D tensor with shape
          [total_num_padded_proposals, num_classes, mask_height, mask_width]
          containing instance mask predictions.

    Raises:
      ValueError: Documented for the case where `predict` is called before
        `preprocess`; the check itself is not performed in this method's
        body (NOTE(review): confirm where, if anywhere, it is raised).
    """
    # Backbone features, the RPN conv features, and the anchor grid.
    (rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist,
     image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs)
    # The RPN head turns the conv features into box regressions and
    # objectness logits.
    (rpn_box_encodings, rpn_objectness_predictions_with_background
     ) = self._predict_rpn_proposals(rpn_box_predictor_features)

    # The Faster R-CNN paper recommends pruning anchors that venture outside
    # the image window at training time and clipping at inference time.
    clip_window = tf.to_float(
        tf.stack([0, 0, image_shape[1], image_shape[2]]))
    if self._is_training:
        # Training: drop anchors (and their predictions) that fall outside
        # the image window.
        (rpn_box_encodings, rpn_objectness_predictions_with_background,
         anchors_boxlist) = self._remove_invalid_anchors_and_predictions(
             rpn_box_encodings, rpn_objectness_predictions_with_background,
             anchors_boxlist, clip_window)
    else:
        anchors_boxlist = box_list_ops.clip_to_window(
            anchors_boxlist, clip_window)

    self._anchors = anchors_boxlist
    prediction_dict = {
        'rpn_box_predictor_features': rpn_box_predictor_features,
        'rpn_features_to_crop': rpn_features_to_crop,
        'image_shape': image_shape,
        'rpn_box_encodings': rpn_box_encodings,
        'rpn_objectness_predictions_with_background':
            rpn_objectness_predictions_with_background,
        'anchors': self._anchors.get(),
    }

    if self._number_of_stages >= 2:
        prediction_dict.update(self._predict_second_stage(
            rpn_box_encodings,
            rpn_objectness_predictions_with_background,
            rpn_features_to_crop,
            self._anchors.get(), image_shape, true_image_shapes))

    if self._number_of_stages == 3:
        prediction_dict = self._predict_third_stage(
            prediction_dict, true_image_shapes)

    return prediction_dict
_extract_rpn_feature_maps
def _extract_rpn_feature_maps(self, preprocessed_inputs):
    """Extracts RPN features.

    This function extracts two feature maps: a feature map to be directly
    fed to a box predictor (to predict location and objectness scores for
    proposals) and a feature map from which to crop regions which will then
    be sent to the second stage box classifier.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] image tensor.

    Returns:
      rpn_box_predictor_features: A 4-D float32 tensor with shape
        [batch, height, width, depth] to be used for predicting proposal
        boxes and corresponding objectness scores.
      rpn_features_to_crop: A 4-D float32 tensor with shape
        [batch, height, width, depth] representing image features to crop
        using the proposals boxes.
      anchors: A BoxList representing anchors (for the RPN) in
        absolute coordinates.
      image_shape: A 1-D tensor representing the input image shape.
    """
    # Dynamic shape of the preprocessed image batch.
    image_shape = tf.shape(preprocessed_inputs)
    # Feature map produced by the feature extractor's first stage; the
    # second return value (activations) is discarded.
    # NOTE(review): the original annotation says this is the backbone's
    # block3 output (ResNet + FPN); that depends on the configured feature
    # extractor - confirm.
    rpn_features_to_crop, _ = self._feature_extractor.extract_proposal_features(
        preprocessed_inputs, scope=self.first_stage_feature_extractor_scope)

    # Anchors are generated for the single feature-map grid (height, width).
    feature_map_shape = tf.shape(rpn_features_to_crop)
    anchors = box_list_ops.concatenate(
        self._first_stage_anchor_generator.generate([(feature_map_shape[1],
                                                      feature_map_shape[2])]))
    with slim.arg_scope(self._first_stage_box_predictor_arg_scope_fn()):
        kernel_size = self._first_stage_box_predictor_kernel_size
        # One more convolution on top of the extracted feature map; its
        # output feeds the RPN box predictor.
        rpn_box_predictor_features = slim.conv2d(
            rpn_features_to_crop,
            self._first_stage_box_predictor_depth,
            kernel_size=[kernel_size, kernel_size],
            rate=self._first_stage_atrous_rate,
            activation_fn=tf.nn.relu6)
    return (rpn_box_predictor_features, rpn_features_to_crop,
            anchors, image_shape)
_predict_rpn_proposals
def _predict_rpn_proposals(self, rpn_box_predictor_features):
    """Runs the first-stage box predictor on the RPN feature map.

    The returned tensors are raw predictions taken from the box predictor's
    output dictionary; no decoding or filtering happens here.

    Args:
      rpn_box_predictor_features: A 4-D float32 tensor with shape
        [batch, height, width, depth] to be used for predicting proposal
        boxes and corresponding objectness scores.

    Returns:
      box_encodings: 3-D float tensor of shape
        [batch_size, num_anchors, self._box_coder.code_size] containing
        predicted boxes.
      objectness_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_anchors, 2] containing class
        predictions (logits) for each of the anchors. Note that this
        tensor *includes* background class predictions (at class index 0).

    Raises:
      RuntimeError: if the anchor generator reports anchors for anything
        other than exactly one feature map.
    """
    num_anchors_per_location = (
        self._first_stage_anchor_generator.num_anchors_per_location())
    if len(num_anchors_per_location) != 1:
        raise RuntimeError('anchor_generator is expected to generate anchors '
                           'corresponding to a single feature map.')
    box_predictions = self._first_stage_box_predictor.predict(
        [rpn_box_predictor_features],
        num_anchors_per_location,
        scope=self.first_stage_box_predictor_scope)

    # The predictor returns per-feature-map lists; join them along the
    # anchor axis.
    box_encodings = tf.concat(
        box_predictions[box_predictor.BOX_ENCODINGS], axis=1)
    objectness_predictions_with_background = tf.concat(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1)
    # Axis 2 of the box encodings is squeezed away (presumably a singleton
    # class axis for the class-agnostic RPN - TODO confirm).
    return (tf.squeeze(box_encodings, axis=2),
            objectness_predictions_with_background)
_predict_second_stage
def _predict_second_stage(self, rpn_box_encodings,
                          rpn_objectness_predictions_with_background,
                          rpn_features_to_crop,
                          anchors,
                          image_shape,
                          true_image_shapes):
    """Predicts the output tensors from second stage of Faster R-CNN.

    Args:
      rpn_box_encodings: 3-D float tensor of shape
        [batch_size, num_valid_anchors, self._box_coder.code_size] containing
        predicted boxes.
      rpn_objectness_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_valid_anchors, 2] containing class
        predictions (logits) for each of the anchors. Note that this
        tensor *includes* background class predictions (at class index 0).
      rpn_features_to_crop: A 4-D float32 tensor with shape
        [batch_size, height, width, depth] representing image features to
        crop using the proposal boxes predicted by the RPN.
      anchors: 2-D float tensor of shape
        [num_anchors, self._box_coder.code_size].
      image_shape: A 1D int32 tensors of size [4] containing the image shape.
      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
        of the form [height, width, channels] indicating the shapes
        of true images in the resized images, as resized images can be
        padded with zeros.

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) refined_box_encodings: a 3-D tensor with shape
          [total_num_proposals, num_classes, self._box_coder.code_size]
          representing predicted (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals. If using a
          shared box across classes the shape will instead be
          [total_num_proposals, 1, self._box_coder.code_size].
        2) class_predictions_with_background: a 2-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Note that this tensor *includes* background class predictions
          (at class index 0).
        3) num_proposals: An int32 tensor of shape [batch_size] representing
          the number of proposals generated by the RPN. `num_proposals`
          allows us to keep track of which entries are to be treated as zero
          paddings and which are not since we always pad the number of
          proposals to be `self.max_num_proposals` for each image.
        4) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
          decoded proposal bounding boxes in absolute coordinates.
        5) proposal_boxes_normalized: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing decoded
          proposal bounding boxes in normalized coordinates. Can be used to
          override the boxes proposed by the RPN, thus enabling one to
          extract features and get box classification and prediction for
          externally selected areas of the image.
        6) box_classifier_features: a 4-D float32 tensor representing the
          features for each proposal.
    """
    image_shape_2d = self._image_batch_shape_2d(image_shape)
    # Post-process the raw RPN outputs into normalized proposal boxes; the
    # second return value is not needed here.
    proposal_boxes_normalized, _, num_proposals = self._postprocess_rpn(
        rpn_box_encodings, rpn_objectness_predictions_with_background,
        anchors, image_shape_2d, true_image_shapes)

    # Crop the shared feature map to each proposal, then run the second
    # stage of the feature extractor on the crops.
    flattened_proposal_feature_maps = (
        self._compute_second_stage_input_feature_maps(
            rpn_features_to_crop, proposal_boxes_normalized))

    box_classifier_features = (
        self._feature_extractor.extract_box_classifier_features(
            flattened_proposal_feature_maps,
            scope=self.second_stage_feature_extractor_scope))

    box_predictions = self._mask_rcnn_box_predictor.predict(
        [box_classifier_features],
        num_predictions_per_location=[1],
        scope=self.second_stage_box_predictor_scope,
        predict_boxes_and_classes=True)

    # One prediction per location, so axis 1 is a singleton and is removed.
    refined_box_encodings = tf.squeeze(
        box_predictions[box_predictor.BOX_ENCODINGS],
        axis=1, name='all_refined_box_encodings')
    class_predictions_with_background = tf.squeeze(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND],
        axis=1, name='all_class_predictions_with_background')

    absolute_proposal_boxes = ops.normalized_to_image_coordinates(
        proposal_boxes_normalized, image_shape, self._parallel_iterations)

    prediction_dict = {
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background':
            class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
        'box_classifier_features': box_classifier_features,
        'proposal_boxes_normalized': proposal_boxes_normalized,
    }
    return prediction_dict
_predict_third_stage
def _predict_third_stage(self, prediction_dict, image_shapes):
    """Predicts non-box, non-class outputs using refined detections.

    For training, masks are predicted directly on the
    box_classifier_features, which are region-features from the initial
    anchor boxes.
    For inference, this happens after calling the post-processing stage,
    such that masks are only calculated for the top scored boxes.

    Args:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) refined_box_encodings: a 3-D tensor with shape
          [total_num_proposals, num_classes, self._box_coder.code_size]
          representing predicted (final) refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals. If using a
          shared box across classes the shape will instead be
          [total_num_proposals, 1, self._box_coder.code_size].
        2) class_predictions_with_background: a 2-D tensor with shape
          [total_num_proposals, num_classes + 1] containing class
          predictions (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Note that this tensor *includes* background class predictions
          (at class index 0).
        3) num_proposals: An int32 tensor of shape [batch_size] representing
          the number of proposals generated by the RPN. `num_proposals`
          allows us to keep track of which entries are to be treated as zero
          paddings and which are not since we always pad the number of
          proposals to be `self.max_num_proposals` for each image.
        4) proposal_boxes: A float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] representing
          decoded proposal bounding boxes in absolute coordinates.
        5) box_classifier_features: a 4-D float32 tensor representing the
          features for each proposal (read in training mode).
        6) rpn_features_to_crop: the feature map to crop from (read in
          inference mode).
      image_shapes: A 2-D int32 tensors of shape [batch_size, 3] containing
        shapes of images in the batch.

    Returns:
      prediction_dict: the input dictionary, updated in place, which in
      addition to the input predictions holds:
        * training: 'mask_predictions', the squeezed mask output of the box
          predictor.
        * inference: every entry returned by the box-classifier
          post-processing, plus
          `fields.DetectionResultFields.detection_masks`, a 4-D tensor with
          shape [batch_size, max_detection, mask_height, mask_width]
          containing instance mask predictions.
    """
    if self._is_training:
        # Training: predict masks straight from the second-stage features
        # computed for every proposal.
        curr_box_classifier_features = prediction_dict['box_classifier_features']
        mask_predictions = self._mask_rcnn_box_predictor.predict(
            [curr_box_classifier_features],
            num_predictions_per_location=[1],
            scope=self.second_stage_box_predictor_scope,
            predict_boxes_and_classes=False,
            predict_auxiliary_outputs=True)
        prediction_dict['mask_predictions'] = tf.squeeze(mask_predictions[
            box_predictor.MASK_PREDICTIONS], axis=1)
    else:
        # Inference: post-process first, then predict masks only for the
        # boxes that survived.
        detections_dict = self._postprocess_box_classifier(
            prediction_dict['refined_box_encodings'],
            prediction_dict['class_predictions_with_background'],
            prediction_dict['proposal_boxes'],
            prediction_dict['num_proposals'],
            image_shapes)
        prediction_dict.update(detections_dict)
        detection_boxes = detections_dict[
            fields.DetectionResultFields.detection_boxes]
        detection_classes = detections_dict[
            fields.DetectionResultFields.detection_classes]
        rpn_features_to_crop = prediction_dict['rpn_features_to_crop']
        batch_size = tf.shape(detection_boxes)[0]
        # NOTE(review): this dynamic value is overwritten below by the
        # static dimension of `detection_classes`; the op is kept so the
        # constructed graph is unchanged.
        max_detection = tf.shape(detection_boxes)[1]
        flattened_detected_feature_maps = (
            self._compute_second_stage_input_feature_maps(
                rpn_features_to_crop, detection_boxes))
        curr_box_classifier_features = (
            self._feature_extractor.extract_box_classifier_features(
                flattened_detected_feature_maps,
                scope=self.second_stage_feature_extractor_scope))

        mask_predictions = self._mask_rcnn_box_predictor.predict(
            [curr_box_classifier_features],
            num_predictions_per_location=[1],
            scope=self.second_stage_box_predictor_scope,
            predict_boxes_and_classes=False,
            predict_auxiliary_outputs=True)

        detection_masks = tf.squeeze(mask_predictions[
            box_predictor.MASK_PREDICTIONS], axis=1)

        _, num_classes, mask_height, mask_width = (
            detection_masks.get_shape().as_list())
        _, max_detection = detection_classes.get_shape().as_list()
        if num_classes > 1:
            # With per-class masks, keep only the mask of each detection's
            # predicted class.
            detection_masks = self._gather_instance_masks(
                detection_masks, detection_classes)

        prediction_dict[fields.DetectionResultFields.detection_masks] = (
            tf.reshape(detection_masks,
                       [batch_size, max_detection, mask_height, mask_width]))

    return prediction_dict