AVOD Source Code Notes
AVOD Code Structure
The code is mainly divided into the following parts:
- Data pre-generation
- Train
- Evaluate+Infer
Code Details
Data Pre-generation
Generates the input data for the RPN: the clustered anchor sizes of each class, and the concrete anchor information generated for every sample.
Call Chain
base_dir = avod/
config = avod/avod/configs/mb_preprocessing/rpn_cars(cyclists,pedestrians,people).config
Main module call sequence:
scripts/preprocessing/gen_mini_batches.py->avod/builders/dataset_builder.py(build_kitti_dataset)->avod/datasets/kitti/kitti_dataset.py(KittiDataset)->avod/datasets/kitti/kitti_utils.py(KittiUtils)->avod/core/mini_batch_utils.py(MiniBatchUtils.preprocess_rpn_mini_batches)->avod/core/mini_batch_preprocessor.py(MiniBatchPreprocessor.preprocess)->avod/core/anchor_generator/grid_anchor_3d_generator.py(GridAnchor3dGenerator.generate)
Core Components
- Data preprocessing: mini-batch anchor generation
The AVOD pre-generation step (gen_mini_batches) has two parts: cluster the object sizes of each class, then use the clustering results to generate per-class anchor information that serves as the input data of the RPN.
The anchor info for each anchor is [max_gt_2d_iou, max_gt_3d_iou, (6 x offsets), class_index]: the anchor's highest IoU with the ground truth (in 2D and 3D), the anchor offsets, and the index of the matched class. A small illustrative example follows.
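To make the layout concrete, here is a hypothetical example of a single anchors_info row (made-up numbers, following the layout described above, not taken from the repo):

```python
import numpy as np

# Hypothetical anchors_info row:
# [max_gt_2d_iou, max_gt_3d_iou, dx, dy, dz, dl, dw, dh, class_index]
anchor_info_row = np.array([
    0.72,                                   # highest 2D (BEV) IoU with any ground-truth box
    0.55,                                   # highest 3D IoU with that ground-truth box
    0.10, -0.05, 0.20, 0.02, -0.01, 0.03,   # 6 regression offsets towards the matched box
    1.0,                                    # class index of the matched ground truth
])

print(anchor_info_row.shape)  # (9,) -> 2 IoUs + 6 offsets + 1 class index
```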
The steps are:
- Generate 3D anchors at anchor_stride (0.5 by default)
- Build a 2D voxel grid and filter out empty anchors (a sketch of this idea is shown after the code below)
- Compute the IoU between anchors and ground truth, pick the class with the highest IoU for each anchor, and update the offsets and class_index
The core code is as follows:
```python
# mini_batch_preprocessor.py:49
def preprocess(self, indices):
    """Preprocesses anchor info and saves info to files

    Args:
        indices (int array): sample indices to process.
            If None, processes all samples
    """
    # Get anchor stride for class (default 0.5)
    anchor_strides = self._anchor_strides

    dataset = self._dataset
    dataset_utils = self._dataset.kitti_utils
    classes_name = dataset.classes_name

    # Make folder if it doesn't exist yet
    output_dir = self.mini_batch_utils.get_file_path(classes_name,
                                                     anchor_strides,
                                                     sample_name=None)
    os.makedirs(output_dir, exist_ok=True)

    # Get clusters for class
    # The clustered sizes are used to generate the anchor sizes
    all_clusters_sizes, _ = dataset.get_cluster_info()

    # Initialize the 3D anchor generator
    anchor_generator = grid_anchor_3d_generator.GridAnchor3dGenerator()

    # Load indices of data_split
    all_samples = dataset.sample_list

    if indices is None:
        indices = np.arange(len(all_samples))
    num_samples = len(indices)

    # For each image in the dataset, save info on the anchors
    for sample_idx in indices:
        # Get image name for given cluster
        sample_name = all_samples[sample_idx].name
        img_idx = int(sample_name)

        # Check for existing files and skip to the next
        if self._check_for_existing(classes_name, anchor_strides,
                                    sample_name):
            print("{} / {}: Sample already preprocessed".format(
                sample_idx + 1, num_samples, sample_name))
            continue

        # Get ground truth and filter based on difficulty
        ground_truth_list = obj_utils.read_labels(dataset.label_dir,
                                                  img_idx)

        # Filter objects to dataset classes
        filtered_gt_list = dataset_utils.filter_labels(ground_truth_list)
        filtered_gt_list = np.asarray(filtered_gt_list)

        # Filtering by class has no valid ground truth, skip this image
        if len(filtered_gt_list) == 0:
            print("{} / {} No {}s for sample {} "
                  "(Ground Truth Filter)".format(
                      sample_idx + 1, num_samples,
                      classes_name, sample_name))

            # Output an empty file and move on to the next image.
            self._save_to_file(classes_name, anchor_strides, sample_name)
            continue

        # Get ground plane
        ground_plane = obj_utils.get_road_plane(img_idx,
                                                dataset.planes_dir)

        image = Image.open(dataset.get_rgb_image_path(sample_name))
        image_shape = [image.size[1], image.size[0]]

        # Generate sliced 2D voxel grid for filtering
        # Only BEV information inside the image view is kept here
        vx_grid_2d = dataset_utils.create_sliced_voxel_grid_2d(
            sample_name,
            source=dataset.bev_source,
            image_shape=image_shape)

        # List for merging all anchors
        all_anchor_boxes_3d = []

        # Create anchors for each class
        for class_idx in range(len(dataset.classes)):
            # Generate anchors for all classes
            # 3D anchors are generated from the per-class anchor sizes,
            # the stride and the ground plane
            grid_anchor_boxes_3d = anchor_generator.generate(
                area_3d=self._area_extents,
                anchor_3d_sizes=all_clusters_sizes[class_idx],
                anchor_stride=self._anchor_strides[class_idx],
                ground_plane=ground_plane)

            all_anchor_boxes_3d.extend(grid_anchor_boxes_3d)

        # Filter empty anchors
        all_anchor_boxes_3d = np.asarray(all_anchor_boxes_3d)
        anchors = box_3d_encoder.box_3d_to_anchor(all_anchor_boxes_3d)
        empty_anchor_filter = anchor_filter.get_empty_anchor_filter_2d(
            anchors, vx_grid_2d, self._density_threshold)

        # Calculate anchor info
        # IoUs between all anchors and the ground truth are computed here
        # to find each anchor's matched target
        anchors_info = self._calculate_anchors_info(
            all_anchor_boxes_3d, empty_anchor_filter, filtered_gt_list)

        anchor_ious = anchors_info[:, self.mini_batch_utils.col_ious]

        valid_iou_indices = np.where(anchor_ious > 0.0)[0]

        print("{} / {}:"
              "{:>6} anchors, "
              "{:>6} iou > 0.0, "
              "for {:>3} {}(s) for sample {}".format(
                  sample_idx + 1, num_samples,
                  len(anchors_info),
                  len(valid_iou_indices),
                  len(filtered_gt_list), classes_name, sample_name))

        # Save anchors info
        self._save_to_file(classes_name, anchor_strides,
                           sample_name, anchors_info)
```
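The empty-anchor filtering step above boils down to counting how many lidar points fall inside each anchor's BEV footprint and dropping anchors below a density threshold. A minimal numpy sketch of that idea (a hypothetical helper, not the repo's get_empty_anchor_filter_2d, which works on the voxel grid):

```python
import numpy as np

def filter_empty_anchors(anchor_boxes_bev, points_xz, density_threshold=1):
    """Keep anchors whose BEV footprint contains enough lidar points.

    anchor_boxes_bev: (N, 4) array of [x_min, z_min, x_max, z_max]
    points_xz:        (M, 2) array of lidar point (x, z) coordinates
    Returns a boolean mask of length N.
    """
    keep = np.zeros(len(anchor_boxes_bev), dtype=bool)
    for i, (x_min, z_min, x_max, z_max) in enumerate(anchor_boxes_bev):
        inside = ((points_xz[:, 0] >= x_min) & (points_xz[:, 0] <= x_max) &
                  (points_xz[:, 1] >= z_min) & (points_xz[:, 1] <= z_max))
        keep[i] = inside.sum() >= density_threshold
    return keep

# Toy usage: two anchors, only the first one contains points
anchors = np.array([[0.0, 0.0, 2.0, 4.0],
                    [10.0, 10.0, 12.0, 14.0]])
points = np.array([[1.0, 2.0], [1.5, 3.0]])
print(filter_empty_anchors(anchors, points))  # [ True False]
```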
The 3D anchor generation itself consists of the following steps:
- Determine the anchor generation range (area_extents)
- Generate the anchor center points over that range according to the stride
- Generate the size and rotation distributions -> assemble the anchor matrix
```python
def tile_anchors_3d(area_extents, anchor_3d_sizes, anchor_stride, ground_plane):
    """Tiles anchors over the area extents by using meshgrids to
    generate combinations of (x, y, z), (l, w, h) and ry.

    Args:
        area_extents: [[min_x, max_x], [min_y, max_y], [min_z, max_z]]
        anchor_3d_sizes: list of 3d anchor sizes N x (l, w, h)
        anchor_stride: stride lengths (x_stride, z_stride)
        ground_plane: coefficients of the ground plane e.g. [0, -1, 0, 0]

    Returns:
        boxes: list of 3D anchors in box_3d format N x [x, y, z, l, w, h, ry]
    """
    # Convert sizes to ndarray
    # In the KITTI camera frame, x and z span the ground plane and y is the height axis
    anchor_3d_sizes = np.asarray(anchor_3d_sizes)

    anchor_stride_x = anchor_stride[0]
    anchor_stride_z = anchor_stride[1]
    anchor_rotations = np.asarray([0, np.pi / 2.0])

    x_start = area_extents[0][0] + anchor_stride[0] / 2.0
    x_end = area_extents[0][1]
    x_centers = np.array(np.arange(x_start, x_end, step=anchor_stride_x),
                         dtype=np.float32)

    z_start = area_extents[2][1] - anchor_stride[1] / 2.0
    z_end = area_extents[2][0]
    z_centers = np.array(np.arange(z_start, z_end, step=-anchor_stride_z),
                         dtype=np.float32)

    # Use ranges for substitution
    size_indices = np.arange(0, len(anchor_3d_sizes))
    rotation_indices = np.arange(0, len(anchor_rotations))

    # Generate matrix for substitution
    # e.g. for two sizes and two rotations
    # [[x0, z0, 0, 0], [x0, z0, 0, 1], [x0, z0, 1, 0], [x0, z0, 1, 1],
    #  [x1, z0, 0, 0], [x1, z0, 0, 1], [x1, z0, 1, 0], [x1, z0, 1, 1], ...]
    before_sub = np.stack(np.meshgrid(x_centers,
                                      z_centers,
                                      size_indices,
                                      rotation_indices),
                          axis=4).reshape(-1, 4)

    # Place anchors on the ground plane
    # Use the meshgrid above to generate the anchor center points
    a, b, c, d = ground_plane
    all_x = before_sub[:, 0]
    all_z = before_sub[:, 1]
    all_y = -(a * all_x + c * all_z + d) / b

    # Create empty matrix to return
    num_anchors = len(before_sub)
    all_anchor_boxes_3d = np.zeros((num_anchors, 7))

    # Fill in x, y, z
    all_anchor_boxes_3d[:, 0:3] = np.stack((all_x, all_y, all_z), axis=1)

    # Fill in shapes
    sizes = anchor_3d_sizes[np.asarray(before_sub[:, 2], np.int32)]
    all_anchor_boxes_3d[:, 3:6] = sizes

    # Fill in rotations
    rotations = anchor_rotations[np.asarray(before_sub[:, 3], np.int32)]
    all_anchor_boxes_3d[:, 6] = rotations

    return all_anchor_boxes_3d
```
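A hedged usage example of the function above; the extents, stride, anchor size and ground plane here are made-up values, not the actual config:

```python
import numpy as np

# Uses tile_anchors_3d as defined in the snippet above.
area_extents = np.array([[-40.0, 40.0],   # x range (m)
                         [-5.0, 3.0],     # y range (m), height
                         [0.0, 70.0]])    # z range (m), forward
anchor_sizes = np.array([[3.9, 1.6, 1.5]])   # one clustered (l, w, h), e.g. for cars
anchor_stride = [0.5, 0.5]                   # x and z stride in metres
ground_plane = [0.0, -1.0, 0.0, 1.65]        # flat ground ~1.65 m below the camera

anchors = tile_anchors_3d(area_extents, anchor_sizes, anchor_stride, ground_plane)
# Each row is [x, y, z, l, w, h, ry]; two rotations (0 and pi/2) per location
print(anchors.shape)
```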
Model Training
The overall AVOD model has three parts: backbone + RPN + AVOD network; see avod_paperreading for details.
The backbone is a VGG + FPN structure extended with a BEV feature branch (the 3D lidar data is converted into 2D BEV features), which is later fused with the image features. The RPN generates region proposals, and the AVOD network performs the final object classification and box regression. A small sketch of the fusion step follows.
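To make the fusion concrete: the cropped BEV and image ROI features are fused either by averaging ('mean' fusion, the default) or by channel concatenation. A minimal numpy sketch with made-up feature shapes:

```python
import numpy as np

# Two ROI feature crops of the same spatial size, e.g. 3x3 with 32 channels each
bev_roi = np.random.rand(3, 3, 32).astype(np.float32)
img_roi = np.random.rand(3, 3, 32).astype(np.float32)

# 'mean' fusion (the default): average the two views
fused_mean = (bev_roi + img_roi) / 2.0                       # shape (3, 3, 32)

# 'concat' fusion: stack along the channel axis
fused_concat = np.concatenate([bev_roi, img_roi], axis=-1)   # shape (3, 3, 64)

print(fused_mean.shape, fused_concat.shape)
```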
Call Chain
base_dir = avod/
config = avod/config/pyramid_cars_with_aug_example.config
Main module call sequence:
scripts/run_training.py->avod/avod/core/trainer.py(builds the model, input data, loss, training ops, etc.)->avod/avod/core/models/avod_model.py->avod/avod/core/models/rpn_model.py
Core Components
- Data preprocessing
Unlike the pre-generation step above, the preprocessing during training operates on the raw input data. It consists of the following parts:
- Reading and filtering the 3D point cloud
After the 3D point cloud is read in, points outside the image view are removed. This involves two parts: ground_plane_filter + image_filter. The former is mainly used when generating the BEV features (voxel spaces are built for different height slices and the points are encoded into features there, see the BEV generation below); the latter filters out points that fall outside the camera view. A minimal sketch of the image-view filtering idea follows.
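A minimal numpy sketch of the image_filter idea, assuming the points are already in the camera frame and using the KITTI P2 projection matrix (a hypothetical helper, not the repo's implementation):

```python
import numpy as np

def image_view_filter(points, p2, image_shape):
    """Keep points (in camera coordinates) that project inside the image.

    points:      (N, 3) points [x, y, z] in the camera frame
    p2:          (3, 4) camera projection matrix from the KITTI calibration
    image_shape: (height, width)
    """
    pts_h = np.hstack([points, np.ones((len(points), 1))])   # homogeneous coords
    proj = pts_h @ p2.T                                       # (N, 3)
    u = proj[:, 0] / proj[:, 2]
    v = proj[:, 1] / proj[:, 2]
    in_front = points[:, 2] > 0                               # keep points in front of the camera
    in_img = (u >= 0) & (u < image_shape[1]) & (v >= 0) & (v < image_shape[0])
    return points[in_front & in_img]
```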
- BEV map generation
The BEV maps are built from the filtered point cloud: within the height range height_lo to height_hi (relative to the ground plane), num_slices slices are taken along the y axis. Each slice is divided into cells of size voxel_size, and the maximum point height within each cell is used as the feature. The result is a feature map of size (bev_width/voxel_size) * (bev_height/voxel_size) * (num_slices + 1), where the +1 channel stores the density information. The code is shown below, followed by a small dimension check.
```python
# avod/avod/datasets/kitti/kitti_utils.py:109
def generate_bev(self, source, point_cloud, ground_plane, area_extents, voxel_size):
    """Generates the BEV maps dictionary. One height map is created for
    each slice of the point cloud. One density map is created for
    the whole point cloud.

    Args:
        source: point cloud source
        point_cloud: point cloud (3, N)
        ground_plane: ground plane coefficients
        area_extents: 3D area extents
            [[min_x, max_x], [min_y, max_y], [min_z, max_z]]
        voxel_size: voxel size in m

    Returns:
        BEV maps dictionary
            height_maps: list of height maps
            density_map: density map
    """
    # Get the point cloud points
    all_points = np.transpose(point_cloud)

    height_maps = []

    for slice_idx in range(self.num_slices):
        height_lo = self.height_lo + slice_idx * self.height_per_division
        height_hi = height_lo + self.height_per_division

        # slice_filter keeps the points of each slice, selected by
        # height relative to the ground plane
        slice_filter = self.kitti_utils.create_slice_filter(
            point_cloud,
            area_extents,
            ground_plane,
            height_lo,
            height_hi)

        # Apply slice filter
        slice_points = all_points[slice_filter]

        if len(slice_points) > 1:
            # Create Voxel Grid 2D
            voxel_grid_2d = VoxelGrid2D()
            voxel_grid_2d.voxelize_2d(
                slice_points, voxel_size,
                extents=area_extents,
                ground_plane=ground_plane,
                create_leaf_layout=False)

            # Remove y values (all 0)
            voxel_indices = voxel_grid_2d.voxel_indices[:, [0, 2]]

            # Create empty BEV images
            height_map = np.zeros((voxel_grid_2d.num_divisions[0],
                                   voxel_grid_2d.num_divisions[2]))

            # Only update pixels where voxels have max height values,
            # and normalize by height of slices
            # Build the height_map holding the max height per voxel
            voxel_grid_2d.heights = voxel_grid_2d.heights - height_lo
            height_map[voxel_indices[:, 0], voxel_indices[:, 1]] = \
                np.asarray(voxel_grid_2d.heights) / self.height_per_division

            height_maps.append(height_map)

    # Rotate height maps 90 degrees
    # (transpose and flip) is faster than np.rot90
    # Needed because the image and BEV views use different coordinate conventions
    height_maps_out = [np.flip(height_maps[map_idx].transpose(), axis=0)
                       for map_idx in range(len(height_maps))]

    # Density filter, computed over the full height range
    density_slice_filter = self.kitti_utils.create_slice_filter(
        point_cloud,
        area_extents,
        ground_plane,
        self.height_lo,
        self.height_hi)

    density_points = all_points[density_slice_filter]

    # Create Voxel Grid 2D
    density_voxel_grid_2d = VoxelGrid2D()
    density_voxel_grid_2d.voxelize_2d(
        density_points,
        voxel_size,
        extents=area_extents,
        ground_plane=ground_plane,
        create_leaf_layout=False)

    # Generate density map
    density_voxel_indices_2d = \
        density_voxel_grid_2d.voxel_indices[:, [0, 2]]

    density_map = self._create_density_map(
        num_divisions=density_voxel_grid_2d.num_divisions,
        voxel_indices_2d=density_voxel_indices_2d,
        num_pts_per_voxel=density_voxel_grid_2d.num_pts_in_voxel,
        norm_value=self.NORM_VALUES[source])

    bev_maps = dict()
    bev_maps['height_maps'] = height_maps_out
    bev_maps['density_map'] = density_map

    return bev_maps
```
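As a quick check on the feature dimensions described above, assuming an 80 m x 70 m BEV area, a 0.1 m voxel and 5 height slices (illustrative numbers, not necessarily the config defaults):

```python
# Illustrative BEV output size; the numbers below are assumptions
bev_width_m, bev_height_m = 80.0, 70.0   # x and z extents of the BEV area
voxel_size = 0.1                         # metres per BEV cell
num_slices = 5                           # height slices

bev_shape = (int(bev_width_m / voxel_size),
             int(bev_height_m / voxel_size),
             num_slices + 1)             # +1 channel for the density map
print(bev_shape)                         # (800, 700, 6)
```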
- Data augmentation
Data augmentation is applied while the input data is being read; for the car class the default augmentations are flipping + pca_jitter. A minimal sketch of the flipping idea follows.
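A minimal sketch of what the flipping augmentation conceptually does to the point cloud and the 3D boxes (a hypothetical helper, not the repo's kitti_aug implementation):

```python
import numpy as np

def flip_sample(points, boxes_3d):
    """Mirror a sample across the x = 0 plane.

    points:   (N, 3) points in camera coordinates [x, y, z]
    boxes_3d: (M, 7) boxes in box_3d format [x, y, z, l, w, h, ry]
    """
    flipped_points = points.copy()
    flipped_points[:, 0] = -flipped_points[:, 0]         # mirror x

    flipped_boxes = boxes_3d.copy()
    flipped_boxes[:, 0] = -flipped_boxes[:, 0]           # mirror box centre x
    flipped_boxes[:, 6] = np.pi - flipped_boxes[:, 6]    # mirror the yaw angle
    return flipped_points, flipped_boxes
```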
- Backbone
The backbone (feature extractor) has two branches, BEV and image, with very similar structure; see the code below. It can be summarized as conv1*2 -> pool1 -> conv2*2 -> pool2 -> conv3*2 -> pool3 -> conv4 -> (upconv3 + concat3 + fusion3) -> (upconv2 + concat2 + fusion2) -> (upconv1 + concat1 + fusion1). A quick resolution walkthrough follows the code.
#avod/core/feature_extractors/bev_vgg_pyramid.py:30 def build(self, inputs, input_pixel_size, is_training, scope='bev_vgg_pyr'): """ Modified VGG for BEV feature extraction with pyramid features Args: inputs: a tensor of size [batch_size, height, width, channels]. input_pixel_size: size of the input (H x W) is_training: True for training, False for validation/testing. scope: Optional scope for the variables. Returns: The last op containing the log predictions and end_points dict. """ vgg_config = self.config with slim.arg_scope(self.vgg_arg_scope( weight_decay=vgg_config.l2_weight_decay)): with tf.variable_scope(scope, 'bev_vgg_pyr', [inputs]) as sc: end_points_collection = sc.name + '_end_points' # Collect outputs for conv2d, fully_connected and max_pool2d. with slim.arg_scope([slim.conv2d, slim.max_pool2d], outputs_collections=end_points_collection): # Pad 700 to 704 to allow even divisions for max pooling padded = tf.pad(inputs, [[0, 0], [4, 0], [0, 0], [0, 0]]) # Encoder conv1 = slim.repeat(padded, vgg_config.vgg_conv1[0], slim.conv2d, vgg_config.vgg_conv1[1], [3, 3], normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='conv1') pool1 = slim.max_pool2d(conv1, [2, 2], scope='pool1') conv2 = slim.repeat(pool1, vgg_config.vgg_conv2[0], slim.conv2d, vgg_config.vgg_conv2[1], [3, 3], normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='conv2') pool2 = slim.max_pool2d(conv2, [2, 2], scope='pool2') conv3 = slim.repeat(pool2, vgg_config.vgg_conv3[0], slim.conv2d, vgg_config.vgg_conv3[1], [3, 3], normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='conv3') pool3 = slim.max_pool2d(conv3, [2, 2], scope='pool3') conv4 = slim.repeat(pool3, vgg_config.vgg_conv4[0], slim.conv2d, vgg_config.vgg_conv4[1], [3, 3], normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='conv4') # Decoder (upsample and fuse features) upconv3 = slim.conv2d_transpose( conv4, vgg_config.vgg_conv3[1], [3, 3], stride=2, normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='upconv3') concat3 = tf.concat( (conv3, upconv3), axis=3, name='concat3') pyramid_fusion3 = slim.conv2d( concat3, vgg_config.vgg_conv2[1], [3, 3], normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='pyramid_fusion3') upconv2 = slim.conv2d_transpose( pyramid_fusion3, vgg_config.vgg_conv2[1], [3, 3], stride=2, normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='upconv2') concat2 = tf.concat( (conv2, upconv2), axis=3, name='concat2') pyramid_fusion_2 = slim.conv2d( concat2, vgg_config.vgg_conv1[1], [3, 3], normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='pyramid_fusion2') upconv1 = slim.conv2d_transpose( pyramid_fusion_2, vgg_config.vgg_conv1[1], [3, 3], stride=2, normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='upconv1') concat1 = tf.concat( (conv1, upconv1), axis=3, name='concat1') pyramid_fusion1 = slim.conv2d( concat1, vgg_config.vgg_conv1[1], [3, 3], normalizer_fn=slim.batch_norm, normalizer_params={ 'is_training': is_training}, scope='pyramid_fusion1') # Slice off padded area sliced = pyramid_fusion1[:, 4:] feature_maps_out = sliced # Convert end_points_collection into a end_point dict. end_points = slim.utils.convert_collection_to_dict( end_points_collection) return feature_maps_out, end_points
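Since the decoder upsamples back to the padded input resolution, the pyramid output has the same spatial size as its input. A quick illustrative calculation, assuming a 704 x 800 BEV input (only the 700 -> 704 padding comes from the code above; the 800 is an assumed width):

```python
# Illustrative spatial sizes through the BEV pyramid (input size is an assumption)
h, w = 704, 800                                      # padded BEV input
enc = [(h // s, w // s) for s in (1, 2, 4, 8)]       # conv1 .. conv4, after each pooling
dec = [(h // s, w // s) for s in (4, 2, 1)]          # upconv3 .. upconv1 restore resolution
print(enc)   # [(704, 800), (352, 400), (176, 200), (88, 100)]
print(dec)   # [(176, 200), (352, 400), (704, 800)]
```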
- RPN Model
The features from the backbone (feature extraction) each pass through a 1x1 convolution (bottleneck) to produce the input features of the proposal network. The default config enables path_drop: with a certain probability the image or BEV path receives no input, similar to dropout (see avod/avod/core/models/rpn_model.py:create_path_drop_masks). The 3D anchors are then projected onto the BEV and image views: the former are projected directly onto the ground plane, while the latter are obtained through the mapping between lidar and image coordinates (taking the enclosing 2D box). The resulting proposal features are crop_and_resized to the same size, given by roi_crop_size in the config. The features are then fused (mean fusion by default), and the fused features go through two branches of three convolution layers each (fully connected layers in the paper, conv2d in the code) that predict objectness and offsets, forming the first-stage proposals. The proposals are used in two ways: top-k NMS (note that NMS is done jointly over all classes) produces the input of the second stage, while gen_mini_batch samples a mini-batch (512 samples by default, half positive and half negative) to compute the objectness and regression (smooth L1) losses. Note that the mini-batch is built by random shuffling: half of it (256) is filled with shuffled positives, and if there are not enough positives the rest is filled with negatives. Class imbalance is not considered, so classes with few samples may converge slowly or not at all. The build code of the network is shown below, followed by a small sketch of the mini-batch sampling scheme.
#rpn_model.py:280, deteled some code for summary def build(self): # Setup input placeholders self._set_up_input_pls() # Setup feature extractors self._set_up_feature_extractors() bev_proposal_input = self.bev_bottleneck img_proposal_input = self.img_bottleneck fusion_mean_div_factor = 2.0 # If both img and bev probabilites are set to 1.0, don't do # path drop. if not (self._path_drop_probabilities[0] == self._path_drop_probabilities[1] == 1.0): with tf.variable_scope('rpn_path_drop'): random_values = tf.random_uniform(shape=[3], minval=0.0, maxval=1.0) img_mask, bev_mask = self.create_path_drop_masks( self._path_drop_probabilities[0], self._path_drop_probabilities[1], random_values) img_proposal_input = tf.multiply(img_proposal_input, img_mask) bev_proposal_input = tf.multiply(bev_proposal_input, bev_mask) self.img_path_drop_mask = img_mask self.bev_path_drop_mask = bev_mask # Overwrite the division factor fusion_mean_div_factor = img_mask + bev_mask with tf.variable_scope('proposal_roi_pooling'): with tf.variable_scope('box_indices'): def get_box_indices(boxes): proposals_shape = boxes.get_shape().as_list() if any(dim is None for dim in proposals_shape): proposals_shape = tf.shape(boxes) ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32) multiplier = tf.expand_dims( tf.range(start=0, limit=proposals_shape[0]), 1) return tf.reshape(ones_mat * multiplier, [-1]) bev_boxes_norm_batches = tf.expand_dims( self._bev_anchors_norm_pl, axis=0) # These should be all 0's since there is only 1 image tf_box_indices = get_box_indices(bev_boxes_norm_batches) # Do ROI Pooling on BEV bev_proposal_rois = tf.image.crop_and_resize( bev_proposal_input, self._bev_anchors_norm_pl, tf_box_indices, self._proposal_roi_crop_size) # Do ROI Pooling on image img_proposal_rois = tf.image.crop_and_resize( img_proposal_input, self._img_anchors_norm_pl, tf_box_indices, self._proposal_roi_crop_size) with tf.variable_scope('proposal_roi_fusion'): rpn_fusion_out = None if self._fusion_method == 'mean': tf_features_sum = tf.add(bev_proposal_rois, img_proposal_rois) rpn_fusion_out = tf.divide(tf_features_sum, fusion_mean_div_factor) elif self._fusion_method == 'concat': rpn_fusion_out = tf.concat( [bev_proposal_rois, img_proposal_rois], axis=3) else: raise ValueError('Invalid fusion method', self._fusion_method) # TODO: move this section into an separate AnchorPredictor class with tf.variable_scope('anchor_predictor', 'ap', [rpn_fusion_out]): tensor_in = rpn_fusion_out # Parse rpn layers config layers_config = self._config.layers_config.rpn_config l2_weight_decay = layers_config.l2_weight_decay if l2_weight_decay > 0: weights_regularizer = slim.l2_regularizer(l2_weight_decay) else: weights_regularizer = None with slim.arg_scope([slim.conv2d], weights_regularizer=weights_regularizer): # Use conv2d instead of fully_connected layers. cls_fc6 = slim.conv2d(tensor_in, layers_config.cls_fc6, self._proposal_roi_crop_size, padding='VALID', scope='cls_fc6') cls_fc6_drop = slim.dropout(cls_fc6, layers_config.keep_prob, is_training=self._is_training, scope='cls_fc6_drop') cls_fc7 = slim.conv2d(cls_fc6_drop, layers_config.cls_fc7, [1, 1], scope='cls_fc7') cls_fc7_drop = slim.dropout(cls_fc7, layers_config.keep_prob, is_training=self._is_training, scope='cls_fc7_drop') cls_fc8 = slim.conv2d(cls_fc7_drop, 2, [1, 1], activation_fn=None, scope='cls_fc8') objectness = tf.squeeze( cls_fc8, [1, 2], name='cls_fc8/squeezed') # Use conv2d instead of fully_connected layers. 
reg_fc6 = slim.conv2d(tensor_in, layers_config.reg_fc6, self._proposal_roi_crop_size, padding='VALID', scope='reg_fc6') reg_fc6_drop = slim.dropout(reg_fc6, layers_config.keep_prob, is_training=self._is_training, scope='reg_fc6_drop') reg_fc7 = slim.conv2d(reg_fc6_drop, layers_config.reg_fc7, [1, 1], scope='reg_fc7') reg_fc7_drop = slim.dropout(reg_fc7, layers_config.keep_prob, is_training=self._is_training, scope='reg_fc7_drop') reg_fc8 = slim.conv2d(reg_fc7_drop, 6, [1, 1], activation_fn=None, scope='reg_fc8') offsets = tf.squeeze( reg_fc8, [1, 2], name='reg_fc8/squeezed') # Return the proposals with tf.variable_scope('proposals'): anchors = self.placeholders[self.PL_ANCHORS] # Decode anchor regression offsets with tf.variable_scope('decoding'): regressed_anchors = anchor_encoder.offset_to_anchor( anchors, offsets) with tf.variable_scope('bev_projection'): _, bev_proposal_boxes_norm = anchor_projector.project_to_bev( regressed_anchors, self._bev_extents) with tf.variable_scope('softmax'): objectness_softmax = tf.nn.softmax(objectness) with tf.variable_scope('nms'): objectness_scores = objectness_softmax[:, 1] # Do NMS on regressed anchors top_indices = tf.image.non_max_suppression( bev_proposal_boxes_norm, objectness_scores, max_output_size=self._nms_size, iou_threshold=self._nms_iou_thresh) top_anchors = tf.gather(regressed_anchors, top_indices) top_objectness_softmax = tf.gather(objectness_scores, top_indices) # top_offsets = tf.gather(offsets, top_indices) # top_objectness = tf.gather(objectness, top_indices) # Get mini batch all_ious_gt = self.placeholders[self.PL_ANCHOR_IOUS] all_offsets_gt = self.placeholders[self.PL_ANCHOR_OFFSETS] all_classes_gt = self.placeholders[self.PL_ANCHOR_CLASSES] with tf.variable_scope('mini_batch'): mini_batch_utils = self.dataset.kitti_utils.mini_batch_utils mini_batch_mask, _ = \ mini_batch_utils.sample_rpn_mini_batch(all_ious_gt) # Ground Truth Tensors with tf.variable_scope('one_hot_classes'): # Anchor classification ground truth # Object / Not Object min_pos_iou = \ self.dataset.kitti_utils.mini_batch_utils.rpn_pos_iou_range[0] objectness_classes_gt = tf.cast( tf.greater_equal(all_ious_gt, min_pos_iou), dtype=tf.int32) objectness_gt = tf.one_hot( objectness_classes_gt, depth=2, on_value=1.0 - self._config.label_smoothing_epsilon, off_value=self._config.label_smoothing_epsilon) # Mask predictions for mini batch with tf.variable_scope('prediction_mini_batch'): objectness_masked = tf.boolean_mask(objectness, mini_batch_mask) offsets_masked = tf.boolean_mask(offsets, mini_batch_mask) with tf.variable_scope('ground_truth_mini_batch'): objectness_gt_masked = tf.boolean_mask( objectness_gt, mini_batch_mask) offsets_gt_masked = tf.boolean_mask(all_offsets_gt, mini_batch_mask) # Specify the tensors to evaluate predictions = dict() # Temporary predictions for debugging # predictions['anchor_ious'] = anchor_ious # predictions['anchor_offsets'] = all_offsets_gt if self._train_val_test in ['train', 'val']: # All anchors predictions[self.PRED_ANCHORS] = anchors # Mini-batch masks predictions[self.PRED_MB_MASK] = mini_batch_mask # Mini-batch predictions predictions[self.PRED_MB_OBJECTNESS] = objectness_masked predictions[self.PRED_MB_OFFSETS] = offsets_masked # Mini batch ground truth predictions[self.PRED_MB_OFFSETS_GT] = offsets_gt_masked predictions[self.PRED_MB_OBJECTNESS_GT] = objectness_gt_masked # Proposals after nms predictions[self.PRED_TOP_INDICES] = top_indices predictions[self.PRED_TOP_ANCHORS] = top_anchors predictions[ 
self.PRED_TOP_OBJECTNESS_SOFTMAX] = top_objectness_softmax else: # self._train_val_test == 'test' predictions[self.PRED_TOP_ANCHORS] = top_anchors predictions[ self.PRED_TOP_OBJECTNESS_SOFTMAX] = top_objectness_softmax return predictions
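As noted above, the RPN mini-batch is sampled by shuffling positives and padding with negatives, without class balancing. A minimal numpy sketch of that sampling scheme (a hypothetical helper, not the repo's sample_rpn_mini_batch; the IoU thresholds are illustrative):

```python
import numpy as np

def sample_rpn_mini_batch(anchor_ious, mini_batch_size=512,
                          neg_iou_max=0.3, pos_iou_min=0.5):
    """Sample a mini batch: up to half positives, the rest filled with negatives.

    anchor_ious: (N,) highest IoU of each anchor with the ground truth.
    """
    pos_indices = np.where(anchor_ious >= pos_iou_min)[0]
    neg_indices = np.where(anchor_ious < neg_iou_max)[0]

    num_pos = min(len(pos_indices), mini_batch_size // 2)
    num_neg = min(len(neg_indices), mini_batch_size - num_pos)

    pos_sample = np.random.choice(pos_indices, num_pos, replace=False)
    neg_sample = np.random.choice(neg_indices, num_neg, replace=False)
    return np.concatenate([pos_sample, neg_sample])

# Toy usage: 1000 anchors with random IoUs
ious = np.random.rand(1000)
mini_batch = sample_rpn_mini_batch(ious)
print(len(mini_batch))
```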
- AVOD Model
The AVOD network takes the top-k anchor proposals from the first stage, obtains the corresponding BEV and image anchor projections, and applies the same crop_and_resize operation. It then performs fusion followed by n * (fc + fc_drop) layers to predict the class, offsets and angle vector (early fusion by default: fuse first, then pass through the subsequent layers). After the predictions are generated, the ground truth is projected onto the BEV view, and the same strategy is used to build the mini-batch and the top anchors (NMS in BEV), producing the corresponding classification, offset and angle losses. The mini-batch loss is what trains the model; the top-anchor path produces the final predictions, but its loss does not appear to be used. The offset loss is computed after converting to the 3D box representation (the box_4c encoding proposed in the paper). The relevant code follows, with a small sketch of the angle-vector encoding after it.
#avod_model.py:123 deleted code for summary def build(self): rpn_model = self._rpn_model # Share the same prediction dict as RPN prediction_dict = rpn_model.build() top_anchors = prediction_dict[RpnModel.PRED_TOP_ANCHORS] ground_plane = rpn_model.placeholders[RpnModel.PL_GROUND_PLANE] class_labels = rpn_model.placeholders[RpnModel.PL_LABEL_CLASSES] with tf.variable_scope('avod_projection'): if self._config.expand_proposals_xz > 0.0: expand_length = self._config.expand_proposals_xz # Expand anchors along x and z with tf.variable_scope('expand_xz'): expanded_dim_x = top_anchors[:, 3] + expand_length expanded_dim_z = top_anchors[:, 5] + expand_length expanded_anchors = tf.stack([ top_anchors[:, 0], top_anchors[:, 1], top_anchors[:, 2], expanded_dim_x, top_anchors[:, 4], expanded_dim_z ], axis=1) avod_projection_in = expanded_anchors else: avod_projection_in = top_anchors with tf.variable_scope('bev'): # Project top anchors into bev and image spaces bev_proposal_boxes, bev_proposal_boxes_norm = \ anchor_projector.project_to_bev( avod_projection_in, self.dataset.kitti_utils.bev_extents) # Reorder projected boxes into [y1, x1, y2, x2] bev_proposal_boxes_tf_order = \ anchor_projector.reorder_projected_boxes( bev_proposal_boxes) bev_proposal_boxes_norm_tf_order = \ anchor_projector.reorder_projected_boxes( bev_proposal_boxes_norm) with tf.variable_scope('img'): image_shape = tf.cast(tf.shape( rpn_model.placeholders[RpnModel.PL_IMG_INPUT])[0:2], tf.float32) img_proposal_boxes, img_proposal_boxes_norm = \ anchor_projector.tf_project_to_image_space( avod_projection_in, rpn_model.placeholders[RpnModel.PL_CALIB_P2], image_shape) # Only reorder the normalized img img_proposal_boxes_norm_tf_order = \ anchor_projector.reorder_projected_boxes( img_proposal_boxes_norm) bev_feature_maps = rpn_model.bev_feature_maps img_feature_maps = rpn_model.img_feature_maps if not (self._path_drop_probabilities[0] == self._path_drop_probabilities[1] == 1.0): with tf.variable_scope('avod_path_drop'): img_mask = rpn_model.img_path_drop_mask bev_mask = rpn_model.bev_path_drop_mask img_feature_maps = tf.multiply(img_feature_maps, img_mask) bev_feature_maps = tf.multiply(bev_feature_maps, bev_mask) else: bev_mask = tf.constant(1.0) img_mask = tf.constant(1.0) # ROI Pooling with tf.variable_scope('avod_roi_pooling'): def get_box_indices(boxes): proposals_shape = boxes.get_shape().as_list() if any(dim is None for dim in proposals_shape): proposals_shape = tf.shape(boxes) ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32) multiplier = tf.expand_dims( tf.range(start=0, limit=proposals_shape[0]), 1) return tf.reshape(ones_mat * multiplier, [-1]) bev_boxes_norm_batches = tf.expand_dims( bev_proposal_boxes_norm, axis=0) # These should be all 0's since there is only 1 image tf_box_indices = get_box_indices(bev_boxes_norm_batches) # Do ROI Pooling on BEV bev_rois = tf.image.crop_and_resize( bev_feature_maps, bev_proposal_boxes_norm_tf_order, tf_box_indices, self._proposal_roi_crop_size, name='bev_rois') # Do ROI Pooling on image img_rois = tf.image.crop_and_resize( img_feature_maps, img_proposal_boxes_norm_tf_order, tf_box_indices, self._proposal_roi_crop_size, name='img_rois') # Fully connected layers (Box Predictor) avod_layers_config = self.model_config.layers_config.avod_config fc_output_layers = \ avod_fc_layers_builder.build( layers_config=avod_layers_config, input_rois=[bev_rois, img_rois], input_weights=[bev_mask, img_mask], num_final_classes=self._num_final_classes, box_rep=self._box_rep, top_anchors=top_anchors, 
ground_plane=ground_plane, is_training=self._is_training) all_cls_logits = \ fc_output_layers[avod_fc_layers_builder.KEY_CLS_LOGITS] all_offsets = fc_output_layers[avod_fc_layers_builder.KEY_OFFSETS] # This may be None all_angle_vectors = \ fc_output_layers.get(avod_fc_layers_builder.KEY_ANGLE_VECTORS) with tf.variable_scope('softmax'): all_cls_softmax = tf.nn.softmax( all_cls_logits) ###################################################### # Subsample mini_batch for the loss function ###################################################### # Get the ground truth tensors anchors_gt = rpn_model.placeholders[RpnModel.PL_LABEL_ANCHORS] if self._box_rep in ['box_3d', 'box_4ca']: boxes_3d_gt = rpn_model.placeholders[RpnModel.PL_LABEL_BOXES_3D] orientations_gt = boxes_3d_gt[:, 6] elif self._box_rep in ['box_8c', 'box_8co', 'box_4c']: boxes_3d_gt = rpn_model.placeholders[RpnModel.PL_LABEL_BOXES_3D] else: raise NotImplementedError('Ground truth tensors not implemented') # Project anchor_gts to 2D bev with tf.variable_scope('avod_gt_projection'): bev_anchor_boxes_gt, _ = anchor_projector.project_to_bev( anchors_gt, self.dataset.kitti_utils.bev_extents) bev_anchor_boxes_gt_tf_order = \ anchor_projector.reorder_projected_boxes(bev_anchor_boxes_gt) with tf.variable_scope('avod_box_list'): # Convert to box_list format anchor_box_list_gt = box_list.BoxList(bev_anchor_boxes_gt_tf_order) anchor_box_list = box_list.BoxList(bev_proposal_boxes_tf_order) #得到minibatch的mask,label index和对应的匹配到的gt index mb_mask, mb_class_label_indices, mb_gt_indices = \ self.sample_mini_batch( anchor_box_list_gt=anchor_box_list_gt, anchor_box_list=anchor_box_list, class_labels=class_labels) # Create classification one_hot vector with tf.variable_scope('avod_one_hot_classes'): mb_classification_gt = tf.one_hot( mb_class_label_indices, depth=self._num_final_classes, on_value=1.0 - self._config.label_smoothing_epsilon, off_value=(self._config.label_smoothing_epsilon / self.dataset.num_classes)) # TODO: Don't create a mini batch in test mode # Mask predictions with tf.variable_scope('avod_apply_mb_mask'): # Classification mb_classifications_logits = tf.boolean_mask( all_cls_logits, mb_mask) mb_classifications_softmax = tf.boolean_mask( all_cls_softmax, mb_mask) # Offsets mb_offsets = tf.boolean_mask(all_offsets, mb_mask) # Angle Vectors if all_angle_vectors is not None: mb_angle_vectors = tf.boolean_mask(all_angle_vectors, mb_mask) else: mb_angle_vectors = None # Encode anchor offsets with tf.variable_scope('avod_encode_mb_anchors'): mb_anchors = tf.boolean_mask(top_anchors, mb_mask) if self._box_rep == 'box_3d': # Gather corresponding ground truth anchors for each mb sample mb_anchors_gt = tf.gather(anchors_gt, mb_gt_indices) mb_offsets_gt = anchor_encoder.tf_anchor_to_offset( mb_anchors, mb_anchors_gt) # Gather corresponding ground truth orientation for each # mb sample mb_orientations_gt = tf.gather(orientations_gt, mb_gt_indices) elif self._box_rep in ['box_8c', 'box_8co']: # Get boxes_3d ground truth mini-batch and convert to box_8c mb_boxes_3d_gt = tf.gather(boxes_3d_gt, mb_gt_indices) if self._box_rep == 'box_8c': mb_boxes_8c_gt = \ box_8c_encoder.tf_box_3d_to_box_8c(mb_boxes_3d_gt) elif self._box_rep == 'box_8co': mb_boxes_8c_gt = \ box_8c_encoder.tf_box_3d_to_box_8co(mb_boxes_3d_gt) # Convert proposals: anchors -> box_3d -> box8c proposal_boxes_3d = \ box_3d_encoder.anchors_to_box_3d(top_anchors, fix_lw=True) proposal_boxes_8c = \ box_8c_encoder.tf_box_3d_to_box_8c(proposal_boxes_3d) # Get mini batch offsets mb_boxes_8c = 
tf.boolean_mask(proposal_boxes_8c, mb_mask) mb_offsets_gt = box_8c_encoder.tf_box_8c_to_offsets( mb_boxes_8c, mb_boxes_8c_gt) # Flatten the offsets to a (N x 24) vector mb_offsets_gt = tf.reshape(mb_offsets_gt, [-1, 24]) elif self._box_rep in ['box_4c', 'box_4ca']: # Get ground plane for box_4c conversion ground_plane = self._rpn_model.placeholders[ self._rpn_model.PL_GROUND_PLANE] # Convert gt boxes_3d -> box_4c mb_boxes_3d_gt = tf.gather(boxes_3d_gt, mb_gt_indices) mb_boxes_4c_gt = box_4c_encoder.tf_box_3d_to_box_4c( mb_boxes_3d_gt, ground_plane) # Convert proposals: anchors -> box_3d -> box_4c proposal_boxes_3d = \ box_3d_encoder.anchors_to_box_3d(top_anchors, fix_lw=True) proposal_boxes_4c = \ box_4c_encoder.tf_box_3d_to_box_4c(proposal_boxes_3d, ground_plane) # Get mini batch mb_boxes_4c = tf.boolean_mask(proposal_boxes_4c, mb_mask) mb_offsets_gt = box_4c_encoder.tf_box_4c_to_offsets( mb_boxes_4c, mb_boxes_4c_gt) if self._box_rep == 'box_4ca': # Gather corresponding ground truth orientation for each # mb sample mb_orientations_gt = tf.gather(orientations_gt, mb_gt_indices) else: raise NotImplementedError( 'Anchor encoding not implemented for', self._box_rep) ###################################################### # Final Predictions ###################################################### # Get orientations from angle vectors if all_angle_vectors is not None: with tf.variable_scope('avod_orientation'): all_orientations = \ orientation_encoder.tf_angle_vector_to_orientation( all_angle_vectors) # Apply offsets to regress proposals with tf.variable_scope('avod_regression'): if self._box_rep == 'box_3d': prediction_anchors = \ anchor_encoder.offset_to_anchor(top_anchors, all_offsets) elif self._box_rep in ['box_8c', 'box_8co']: # Reshape the 24-dim regressed offsets to (N x 3 x 8) reshaped_offsets = tf.reshape(all_offsets, [-1, 3, 8]) # Given the offsets, get the boxes_8c prediction_boxes_8c = \ box_8c_encoder.tf_offsets_to_box_8c(proposal_boxes_8c, reshaped_offsets) # Convert corners back to box3D prediction_boxes_3d = \ box_8c_encoder.box_8c_to_box_3d(prediction_boxes_8c) # Convert the box_3d to anchor format for nms prediction_anchors = \ box_3d_encoder.tf_box_3d_to_anchor(prediction_boxes_3d) elif self._box_rep in ['box_4c', 'box_4ca']: # Convert predictions box_4c -> box_3d prediction_boxes_4c = \ box_4c_encoder.tf_offsets_to_box_4c(proposal_boxes_4c, all_offsets) prediction_boxes_3d = \ box_4c_encoder.tf_box_4c_to_box_3d(prediction_boxes_4c, ground_plane) # Convert to anchor format for nms prediction_anchors = \ box_3d_encoder.tf_box_3d_to_anchor(prediction_boxes_3d) else: raise NotImplementedError('Regression not implemented for', self._box_rep) # Apply Non-oriented NMS in BEV with tf.variable_scope('avod_nms'): bev_extents = self.dataset.kitti_utils.bev_extents with tf.variable_scope('bev_projection'): # Project predictions into BEV avod_bev_boxes, _ = anchor_projector.project_to_bev( prediction_anchors, bev_extents) avod_bev_boxes_tf_order = \ anchor_projector.reorder_projected_boxes( avod_bev_boxes) # Get top score from second column onward all_top_scores = tf.reduce_max(all_cls_logits[:, 1:], axis=1) # Apply NMS in BEV nms_indices = tf.image.non_max_suppression( avod_bev_boxes_tf_order, all_top_scores, max_output_size=self._nms_size, iou_threshold=self._nms_iou_threshold) # Gather predictions from NMS indices top_classification_logits = tf.gather(all_cls_logits, nms_indices) top_classification_softmax = tf.gather(all_cls_softmax, nms_indices) top_prediction_anchors = 
tf.gather(prediction_anchors, nms_indices) if self._box_rep == 'box_3d': top_orientations = tf.gather( all_orientations, nms_indices) elif self._box_rep in ['box_8c', 'box_8co']: top_prediction_boxes_3d = tf.gather( prediction_boxes_3d, nms_indices) top_prediction_boxes_8c = tf.gather( prediction_boxes_8c, nms_indices) elif self._box_rep == 'box_4c': top_prediction_boxes_3d = tf.gather( prediction_boxes_3d, nms_indices) top_prediction_boxes_4c = tf.gather( prediction_boxes_4c, nms_indices) elif self._box_rep == 'box_4ca': top_prediction_boxes_3d = tf.gather( prediction_boxes_3d, nms_indices) top_prediction_boxes_4c = tf.gather( prediction_boxes_4c, nms_indices) top_orientations = tf.gather( all_orientations, nms_indices) else: raise NotImplementedError('NMS gather not implemented for', self._box_rep) if self._train_val_test in ['train', 'val']: # Additional entries are added to the shared prediction_dict # Mini batch predictions prediction_dict[self.PRED_MB_CLASSIFICATION_LOGITS] = \ mb_classifications_logits prediction_dict[self.PRED_MB_CLASSIFICATION_SOFTMAX] = \ mb_classifications_softmax prediction_dict[self.PRED_MB_OFFSETS] = mb_offsets # Mini batch ground truth prediction_dict[self.PRED_MB_CLASSIFICATIONS_GT] = \ mb_classification_gt prediction_dict[self.PRED_MB_OFFSETS_GT] = mb_offsets_gt # Top NMS predictions prediction_dict[self.PRED_TOP_CLASSIFICATION_LOGITS] = \ top_classification_logits prediction_dict[self.PRED_TOP_CLASSIFICATION_SOFTMAX] = \ top_classification_softmax prediction_dict[self.PRED_TOP_PREDICTION_ANCHORS] = \ top_prediction_anchors # Mini batch predictions (for debugging) prediction_dict[self.PRED_MB_MASK] = mb_mask # prediction_dict[self.PRED_MB_POS_MASK] = mb_pos_mask prediction_dict[self.PRED_MB_CLASS_INDICES_GT] = \ mb_class_label_indices # All predictions (for debugging) prediction_dict[self.PRED_ALL_CLASSIFICATIONS] = \ all_cls_logits prediction_dict[self.PRED_ALL_OFFSETS] = all_offsets # Path drop masks (for debugging) prediction_dict['bev_mask'] = bev_mask prediction_dict['img_mask'] = img_mask else: # self._train_val_test == 'test' prediction_dict[self.PRED_TOP_CLASSIFICATION_SOFTMAX] = \ top_classification_softmax prediction_dict[self.PRED_TOP_PREDICTION_ANCHORS] = \ top_prediction_anchors if self._box_rep == 'box_3d': prediction_dict[self.PRED_MB_ANCHORS_GT] = mb_anchors_gt prediction_dict[self.PRED_MB_ORIENTATIONS_GT] = mb_orientations_gt prediction_dict[self.PRED_MB_ANGLE_VECTORS] = mb_angle_vectors prediction_dict[self.PRED_TOP_ORIENTATIONS] = top_orientations # For debugging prediction_dict[self.PRED_ALL_ANGLE_VECTORS] = all_angle_vectors elif self._box_rep in ['box_8c', 'box_8co']: prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \ top_prediction_boxes_3d # Store the corners before converting for visualization purposes prediction_dict[self.PRED_TOP_BOXES_8C] = top_prediction_boxes_8c elif self._box_rep == 'box_4c': prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \ top_prediction_boxes_3d prediction_dict[self.PRED_TOP_BOXES_4C] = top_prediction_boxes_4c elif self._box_rep == 'box_4ca': if self._train_val_test in ['train', 'val']: prediction_dict[self.PRED_MB_ORIENTATIONS_GT] = \ mb_orientations_gt prediction_dict[self.PRED_MB_ANGLE_VECTORS] = mb_angle_vectors prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \ top_prediction_boxes_3d prediction_dict[self.PRED_TOP_BOXES_4C] = top_prediction_boxes_4c prediction_dict[self.PRED_TOP_ORIENTATIONS] = top_orientations else: raise NotImplementedError('Prediction dict not implemented for', 
self._box_rep) # prediction_dict[self.PRED_MAX_IOUS] = max_ious # prediction_dict[self.PRED_ALL_IOUS] = all_ious return prediction_dict
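The angle vector mentioned above encodes the orientation ry as (cos ry, sin ry) and is decoded back with atan2, following the paper's description. A minimal numpy sketch of this encoding (not the exact repo code):

```python
import numpy as np

def angle_to_vector(ry):
    """Encode an orientation angle as a (cos, sin) vector."""
    return np.array([np.cos(ry), np.sin(ry)])

def vector_to_angle(angle_vector):
    """Decode a (cos, sin) vector back to an angle in (-pi, pi]."""
    return np.arctan2(angle_vector[1], angle_vector[0])

ry = 2.5
print(vector_to_angle(angle_to_vector(ry)))   # ~2.5
```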