本系列根据官方源码,分析EfficientDet模型的实现细节。官方代码位于谷歌的automl仓库,可配置选项很多;本系列以默认配置为准,只关注关键的实现细节,一些简单的层会粗略带过。计划分为如下章节:
- Backbone实现细节分析
- BiFPN实现细节分析
- Box与Class回归细节分析
- Loss计算分析
- Optimizer梯度计算分析
官方源码:https://github.com/google/automl
efficientdet模型结构如下:
代码结构如下:
对应代码位于efficientdet/keras/efficientdet_keras.py:
class EfficientDetNet(tf.keras.Model):
  """EfficientDet keras network without pre/post-processing.

  This is the main body of the EfficientDet model. It wires together:
    * a backbone built by `backbone_factory.get_model`,
    * extra `ResampleFeatureMap` layers that add coarser feature levels,
    * the `FPNCells` feature network,
    * the prediction heads listed in `config.heads` (`ClassNet`/`BoxNet` for
      'object_detection', `SegmentationHead` for 'segmentation').
  """

  def __init__(self, model_name=None, config=None, name=''):
    """Initialize model.

    Args:
      model_name: name passed to `hparams_config.get_efficientdet_config` when
        `config` is not given.
      config: model configuration object; when falsy, the default config for
        `model_name` is used.
      name: keras model name. The empty string is allowed (see
        `_init_set_name`).
    """
    super().__init__(name=name)

    # Fall back to the default hyper-parameters registered for `model_name`.
    config = config or hparams_config.get_efficientdet_config(model_name)
    self.config = config

    # Backbone.
    # NOTE(review): the original annotation says the default backbone is
    # efficientnet-b1 and is_training_bn defaults to True -- confirm against
    # hparams_config.
    backbone_name = config.backbone_name
    is_training_bn = config.is_training_bn
    if 'efficientnet' in backbone_name:
      # utils.batch_norm_class picks the batch-norm class from
      # (is_training_bn, strategy); utils.activation_fn is bound to the
      # configured activation type.
      override_params = {
          'batch_norm':
              utils.batch_norm_class(is_training_bn, config.strategy),
          'relu_fn':
              functools.partial(utils.activation_fn, act_type=config.act_type),
      }
      if 'b0' in backbone_name:
        override_params['survival_prob'] = 0.0
      # Optional custom block definitions for the backbone.
      if config.backbone_config is not None:
        override_params['blocks_args'] = (
            efficientnet_builder.BlockDecoder().encode(
                config.backbone_config.blocks))
      override_params['data_format'] = config.data_format
      # Build the backbone.
      self.backbone = backbone_factory.get_model(
          backbone_name, override_params=override_params)

    # Feature network.
    self.resample_layers = []  # additional resampling layers.
    # Levels from 6 up to max_level do not come from the backbone; each one is
    # produced by a ResampleFeatureMap applied to the previous (finer) level.
    for level in range(6, config.max_level + 1):
      # Adds a coarser level by downsampling the last feature map.
      self.resample_layers.append(
          ResampleFeatureMap(
              feat_level=(level - config.min_level),
              target_num_channels=config.fpn_num_filters,
              apply_bn=config.apply_bn_for_resampling,
              is_training_bn=config.is_training_bn,
              conv_after_downsample=config.conv_after_downsample,
              strategy=config.strategy,
              data_format=config.data_format,
              name='resample_p%d' % level,
          ))
    self.fpn_cells = FPNCells(config)

    # class/box output prediction network.
    # Anchors per location = number of aspect ratios * number of scales.
    num_anchors = len(config.aspect_ratios) * config.num_scales
    # The heads use the same channel count as the FPN.
    num_filters = config.fpn_num_filters
    for head in config.heads:
      if head == 'object_detection':
        self.class_net = ClassNet(
            num_classes=config.num_classes,
            num_anchors=num_anchors,
            num_filters=num_filters,
            min_level=config.min_level,
            max_level=config.max_level,
            is_training_bn=config.is_training_bn,
            act_type=config.act_type,
            repeats=config.box_class_repeats,
            separable_conv=config.separable_conv,
            survival_prob=config.survival_prob,
            strategy=config.strategy,
            data_format=config.data_format)

        self.box_net = BoxNet(
            num_anchors=num_anchors,
            num_filters=num_filters,
            min_level=config.min_level,
            max_level=config.max_level,
            is_training_bn=config.is_training_bn,
            act_type=config.act_type,
            repeats=config.box_class_repeats,
            separable_conv=config.separable_conv,
            survival_prob=config.survival_prob,
            strategy=config.strategy,
            data_format=config.data_format)

      if head == 'segmentation':
        self.seg_head = SegmentationHead(
            num_classes=config.seg_num_classes,
            num_filters=num_filters,
            min_level=config.min_level,
            max_level=config.max_level,
            is_training_bn=config.is_training_bn,
            act_type=config.act_type,
            strategy=config.strategy,
            data_format=config.data_format)

  def _init_set_name(self, name, zero_based=True):
    """A hack to allow empty model name for legacy checkpoint compatibility.

    Args:
      name: requested model name; '' is stored as-is.
      zero_based: forwarded to the parent implementation.
    """
    if name == '':  # pylint: disable=g-explicit-bool-comparison
      self._name = name
    else:
      # Delegate to the parent naming hook. The previous code called
      # `super().__init__(name, zero_based)` here, which re-entered the
      # constructor and then overwrote `self._name` with its None result.
      super()._init_set_name(name, zero_based)

  def call(self, inputs, training):
    """Runs backbone, extra resampling, feature network and heads.

    Args:
      inputs: input tensor passed to the backbone.
      training: training flag forwarded to every sub-network.

    Returns:
      A tuple with, in order, (class_outputs, box_outputs) when
      'object_detection' is in `config.heads`, followed by seg_outputs when
      'segmentation' is in `config.heads`.
    """
    config = self.config
    # call backbone network.
    all_feats = self.backbone(inputs, training=training, features_only=True)
    # Keep only the backbone levels in [min_level, max_level]; levels the
    # backbone does not provide are appended below.
    feats = all_feats[config.min_level:config.max_level + 1]

    # Build additional input features that are not from backbone.
    for resample_layer in self.resample_layers:
      feats.append(resample_layer(feats[-1], training, None))

    # call feature network.
    fpn_feats = self.fpn_cells(feats, training)

    # call class/box/seg output network.
    outputs = []
    if 'object_detection' in config.heads:
      class_outputs = self.class_net(fpn_feats, training)
      box_outputs = self.box_net(fpn_feats, training)
      outputs.extend([class_outputs, box_outputs])
    if 'segmentation' in config.heads:
      seg_outputs = self.seg_head(fpn_feats, training)
      outputs.append(seg_outputs)
    return tuple(outputs)
Backbone主体部分
MBConvBlock一共有7层,参数分别是:
- r1_k3_s11_e1_i32_o16_se0.25
- r2_k3_s22_e6_i16_o24_se0.25
- r2_k5_s22_e6_i24_o40_se0.25
- r3_k3_s22_e6_i40_o80_se0.25
- r3_k5_s11_e6_i80_o112_se0.25
- r4_k5_s22_e6_i112_o192_se0.25
- r1_k3_s11_e6_i192_o320_se0.25
r:重复次数,k:卷积核大小,s:步长(如s11表示高、宽方向步长均为1,s22表示均为2),e:扩张倍数(扩张层输出通道数 = 输入通道数 × e,与瓶颈层的压缩相反;e为1时跳过扩张层)
i:输入通道数,o:输出通道数,se:SE(Squeeze-and-Excitation)层的压缩比例(压缩后的通道数 = 输入通道数 × se,最小为1)
最终参数会根据模型进行4个维度的缩放,4个维度分别是:宽度系数(width_coefficient),深度系数(depth_coefficient),分辨率(resolution),dropout比例(dropout_rate)
结构如下,其中橙色层输出的特征会被后续网络使用,用于最终的回归预测:
具体代码位于efficientdet/backbone/efficientnet_model.py 文件,具体如下:
class Model(tf.keras.Model):
  """A class implements tf.keras.Model.

  Builds a stem, a sequence of conv blocks described by `blocks_args`, and a
  head. `call` records intermediate tensors in `self.endpoints`.

  Reference: https://arxiv.org/abs/1807.11626
  """

  def __init__(self, blocks_args=None, global_params=None, name=None):
    """Initializes an `Model` instance.

    Args:
      blocks_args: A list of BlockArgs to construct block modules.
      global_params: GlobalParams, a set of global parameters.
      name: A string of layer name.

    Raises:
      ValueError: when blocks_args is not specified as a list.
    """
    super().__init__(name=name)

    if not isinstance(blocks_args, list):
      raise ValueError('blocks_args should be a list.')
    self._global_params = global_params
    self._blocks_args = blocks_args
    self._relu_fn = global_params.relu_fn or tf.nn.swish
    self._batch_norm = global_params.batch_norm
    # NOTE(review): the original annotation says fix_head_stem defaults to
    # None -- confirm against GlobalParams.
    self._fix_head_stem = global_params.fix_head_stem

    self.endpoints = None

    self._build()

  def _get_conv_block(self, conv_type):
    """Maps a BlockArgs.conv_type value (0 or 1) to the block class."""
    conv_block_map = {0: MBConvBlock, 1: MBConvBlockWithoutDepthwise}
    return conv_block_map[conv_type]

  def _build(self):
    """Builds a model: stem, all conv blocks, then head."""
    self._blocks = []

    # Stem part. Its filter count is taken from the first block's input.
    self._stem = Stem(self._global_params, self._blocks_args[0].input_filters)

    # Builds blocks.
    # Counter yielding 0, 1, 2, ... so blocks are named blocks_0, blocks_1, ...
    block_id = itertools.count(0)
    block_name = lambda: 'blocks_%d' % next(block_id)
    # NOTE(review): the original annotation lists the default block strings as
    #   'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25',
    #   'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25',
    #   'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25',
    #   'r1_k3_s11_e6_i192_o320_se0.25'
    # -- confirm against the builder that produces blocks_args.
    for i, block_args in enumerate(self._blocks_args):
      assert block_args.num_repeat > 0
      assert block_args.super_pixel in [0, 1, 2]
      # Update block input and output filters based on depth multiplier.
      input_filters = round_filters(block_args.input_filters,
                                    self._global_params)

      output_filters = round_filters(block_args.output_filters,
                                     self._global_params)
      kernel_size = block_args.kernel_size
      if self._fix_head_stem and (i == 0 or i == len(self._blocks_args) - 1):
        # First/last stage keep their configured repeat count.
        repeats = block_args.num_repeat
      else:
        # Otherwise the repeat count is rescaled by round_repeats.
        repeats = round_repeats(block_args.num_repeat, self._global_params)
      # Store the actual (scaled) values back into the block arguments.
      block_args = block_args._replace(
          input_filters=input_filters,
          output_filters=output_filters,
          num_repeat=repeats)

      # The first block needs to take care of stride and filter size increase.
      conv_block = self._get_conv_block(block_args.conv_type)
      if not block_args.super_pixel:  # no super_pixel at all
        self._blocks.append(
            conv_block(block_args, self._global_params, name=block_name()))
      else:
        # if superpixel, adjust filters, kernels, and strides.
        depth_factor = int(4 / block_args.strides[0] / block_args.strides[1])
        block_args = block_args._replace(
            input_filters=block_args.input_filters * depth_factor,
            output_filters=block_args.output_filters * depth_factor,
            kernel_size=((block_args.kernel_size + 1) //
                         2 if depth_factor > 1 else block_args.kernel_size))
        # if the first block has stride-2 and super_pixel transformation
        if (block_args.strides[0] == 2 and block_args.strides[1] == 2):
          block_args = block_args._replace(strides=[1, 1])
          self._blocks.append(
              conv_block(block_args, self._global_params, name=block_name()))
          block_args = block_args._replace(  # sp stops at stride-2
              super_pixel=0,
              input_filters=input_filters,
              output_filters=output_filters,
              kernel_size=kernel_size)
        elif block_args.super_pixel == 1:
          self._blocks.append(
              conv_block(block_args, self._global_params, name=block_name()))
          block_args = block_args._replace(super_pixel=2)
        else:
          self._blocks.append(
              conv_block(block_args, self._global_params, name=block_name()))
      # Repeated blocks of a stage take the stage's output filters as input
      # and use stride 1.
      if block_args.num_repeat > 1:  # rest of blocks with the same block_arg
        # pylint: disable=protected-access
        block_args = block_args._replace(
            input_filters=block_args.output_filters, strides=[1, 1])
        # pylint: enable=protected-access
      for _ in range(block_args.num_repeat - 1):
        self._blocks.append(
            conv_block(block_args, self._global_params, name=block_name()))

    # Head part.
    self._head = Head(self._global_params)

  def call(self,
           inputs,
           training,
           features_only=None,
           pooled_features_only=False):
    """Implementation of call().

    Args:
      inputs: input tensors.
      training: boolean, whether the model is constructed for training.
      features_only: build the base feature network only.
      pooled_features_only: build the base network for features extraction
        (after 1x1 conv layer and global pooling, but before dropout and fc
        head).

    Returns:
      A list whose first element is the final output tensor, followed by the
      recorded 'reduction_1' .. 'reduction_5' endpoints that exist.
    """
    self.endpoints = {}
    reduction_idx = 0

    # Calls Stem layers
    outputs = self._stem(inputs, training)
    logging.info('Built stem %s : %s', self._stem.name, outputs.shape)
    self.endpoints['stem'] = outputs

    # Calls blocks.
    num_blocks = len(self._blocks)
    for idx, block in enumerate(self._blocks):
      is_reduction = False  # reduction flag for blocks after the stem layer
      # If the first block has super-pixel (space-to-depth) layer, then stem is
      # the first reduction point.
      if (block.block_args.super_pixel == 1 and idx == 0):
        reduction_idx += 1
        self.endpoints['reduction_%s' % reduction_idx] = outputs

      elif ((idx == num_blocks - 1) or
            self._blocks[idx + 1].block_args.strides[0] > 1):
        # Last block, or the next block downsamples (stride > 1): this
        # block's output is the last tensor at the current resolution.
        is_reduction = True
        reduction_idx += 1

      # Scale the survival probability linearly with block depth.
      survival_prob = self._global_params.survival_prob
      if survival_prob:
        drop_rate = 1.0 - survival_prob
        survival_prob = 1.0 - drop_rate * float(idx) / num_blocks
        logging.info('block_%s survival_prob: %s', idx, survival_prob)
      outputs = block(outputs, training=training, survival_prob=survival_prob)
      # Record every block's output.
      self.endpoints['block_%s' % idx] = outputs
      if is_reduction:
        # Logged (not printed) so library code does not write to stdout.
        logging.info('block.block_args: %s', block.block_args)
        self.endpoints['reduction_%s' % reduction_idx] = outputs
      if block.endpoints:
        # Also expose the block's own internal endpoints.
        for k, v in block.endpoints.items():
          self.endpoints['block_%s/%s' % (idx, k)] = v
          if is_reduction:
            self.endpoints['reduction_%s/%s' % (reduction_idx, k)] = v
    self.endpoints['features'] = outputs

    if not features_only:
      # Calls final layers and returns logits.
      outputs = self._head(outputs, training, pooled_features_only)
      self.endpoints.update(self._head.endpoints)

    # Final output first, then whichever reduction endpoints were recorded.
    reductions = [
        self.endpoints.get('reduction_%s' % level) for level in range(1, 6)
    ]
    return [outputs] + [r for r in reductions if r is not None]
Stem结构如下:
代码如下:
class Stem(tf.keras.layers.Layer):
  """Stem layer at the beginning of the network.

  Applies a 3x3 stride-2 convolution, batch normalization and the configured
  activation, in that order.
  """

  def __init__(self, global_params, stem_filters, name=None):
    """Creates the stem sub-layers.

    Args:
      global_params: GlobalParams providing data_format, batch_norm class,
        batch-norm momentum/epsilon, relu_fn and fix_head_stem.
      stem_filters: base number of stem filters, before `round_filters`.
      name: layer name.
    """
    super().__init__(name=name)
    data_format = global_params.data_format

    # `round_filters` is defined elsewhere in the file; it receives the base
    # filter count, the global params and the fix_head_stem flag.
    num_filters = round_filters(stem_filters, global_params,
                                global_params.fix_head_stem)
    self._conv_stem = tf.keras.layers.Conv2D(
        filters=num_filters,
        kernel_size=[3, 3],
        strides=[2, 2],
        kernel_initializer=conv_kernel_initializer,
        padding='same',
        data_format=data_format,
        use_bias=False)

    # Normalize over the channel axis, whose position depends on data_format.
    if data_format == 'channels_first':
      channel_axis = 1
    else:
      channel_axis = -1
    self._bn = global_params.batch_norm(
        axis=channel_axis,
        momentum=global_params.batch_norm_momentum,
        epsilon=global_params.batch_norm_epsilon)

    self._relu_fn = global_params.relu_fn or tf.nn.swish

  def call(self, inputs, training):
    """Returns activation(batch_norm(conv(inputs)))."""
    conv_out = self._conv_stem(inputs)
    normed = self._bn(conv_out, training=training)
    return self._relu_fn(normed)
MBConvBlock结构如图。其中有一个判断:当expand_ratio为1时(即默认配置的第一组block),会跳过扩张卷积(expand conv)及其BN:
代码如下:
class MBConvBlock(tf.keras.layers.Layer):
  """A class of MBConv: Mobile Inverted Residual Bottleneck.

  Layer order in `call` (non-fused path): optional 1x1 expansion conv + BN +
  activation, depthwise conv + BN + activation, optional squeeze-excitation,
  1x1 projection conv + BN, and an optional residual add.

  Attributes:
    endpoints: dict of internal tensors; set by `call` (None before that).
  """
  def __init__(self, block_args, global_params, name=None):
    """Initializes a MBConv block.

    Args:
      block_args: BlockArgs, arguments to create a Block.
      global_params: GlobalParams, a set of global parameters.
      name: layer name.

    Raises:
      ValueError: if `block_args.condconv` is set.
    """
    super().__init__(name=name)
    self._block_args = block_args
    self._global_params = global_params
    # NOTE(review): original annotation says local_pooling defaults to False
    # -- confirm against GlobalParams.
    self._local_pooling = global_params.local_pooling
    self._batch_norm_momentum = global_params.batch_norm_momentum
    self._batch_norm_epsilon = global_params.batch_norm_epsilon
    self._batch_norm = global_params.batch_norm
    # NOTE(review): original annotation says condconv_num_experts defaults to
    # None -- confirm.
    self._condconv_num_experts = global_params.condconv_num_experts
    self._data_format = global_params.data_format
    self._channel_axis = 1 if self._data_format == 'channels_first' else -1
    self._relu_fn = global_params.relu_fn or tf.nn.swish
    # SE is enabled only when use_se is set and se_ratio lies in (0, 1].
    # NOTE(review): original annotation says this is True by default --
    # confirm.
    self._has_se = (
        global_params.use_se and self._block_args.se_ratio is not None and
        0 < self._block_args.se_ratio <= 1)
    # NOTE(review): original annotation says clip_projection_output defaults
    # to False -- confirm.
    self._clip_projection_output = global_params.clip_projection_output
    self.endpoints = None
    # Conditional convolutions are rejected outright by this class.
    if self._block_args.condconv:
      raise ValueError('Condconv is not supported.')
    # Builds the block accordings to arguments.
    self._build()
  @property
  def block_args(self):
    """The BlockArgs this block was constructed with."""
    return self._block_args
  def _build(self):
    """Builds block according to the arguments."""
    # pylint: disable=g-long-lambda
    # Name generators producing 'x', 'x_1', 'x_2', ... on successive calls.
    # The first call consumes one counter value (0 -> no suffix); every later
    # call consumes two values, hence the `// 2` to get consecutive suffixes.
    bid = itertools.count(0)
    get_bn_name = lambda: 'tpu_batch_normalization' + ('' if not next(
        bid) else '_' + str(next(bid) // 2))
    cid = itertools.count(0)
    get_conv_name = lambda: 'conv2d' + ('' if not next(cid) else '_' + str(
        next(cid) // 2))
    # pylint: enable=g-long-lambda
    # NOTE(review): original annotation says super_pixel defaults to 0 --
    # confirm.
    if self._block_args.super_pixel == 1:
      self.super_pixel = SuperPixel(
          self._block_args, self._global_params, name='super_pixel')
    else:
      self.super_pixel = None
    # Expanded channel count = input filters * expand_ratio.
    filters = self._block_args.input_filters * self._block_args.expand_ratio
    kernel_size = self._block_args.kernel_size
    # NOTE(review): original annotation says fused_conv defaults to 0 --
    # confirm.
    if self._block_args.fused_conv:
      # Fused expansion phase. Called if using fused convolutions.
      self._fused_conv = tf.keras.layers.Conv2D(
          filters=filters,
          kernel_size=[kernel_size, kernel_size],
          strides=self._block_args.strides,
          kernel_initializer=conv_kernel_initializer,
          padding='same',
          data_format=self._data_format,
          use_bias=False,
          name=get_conv_name())
    else:
      # Expansion phase. Called if not using fused convolutions and expansion
      # phase is necessary.
      # Skipped when expand_ratio == 1, so such blocks have no _expand_conv
      # and no _bn0.
      if self._block_args.expand_ratio != 1:
        self._expand_conv = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=[1, 1],
            strides=[1, 1],
            kernel_initializer=conv_kernel_initializer,
            padding='same',
            data_format=self._data_format,
            use_bias=False,
            name=get_conv_name())
        self._bn0 = self._batch_norm(
            axis=self._channel_axis,
            momentum=self._batch_norm_momentum,
            epsilon=self._batch_norm_epsilon,
            name=get_bn_name())
      # Depth-wise convolution phase. Called if not using fused convolutions.
      # A depthwise conv filters each input channel separately, so it takes no
      # `filters` argument here.
      self._depthwise_conv = tf.keras.layers.DepthwiseConv2D(
          kernel_size=[kernel_size, kernel_size],
          strides=self._block_args.strides,
          depthwise_initializer=conv_kernel_initializer,
          padding='same',
          data_format=self._data_format,
          use_bias=False,
          name='depthwise_conv2d')
    # Shared by both paths: call() applies _bn1 after either the fused conv or
    # the depthwise conv.
    self._bn1 = self._batch_norm(
        axis=self._channel_axis,
        momentum=self._batch_norm_momentum,
        epsilon=self._batch_norm_epsilon,
        name=get_bn_name())
    if self._has_se:
      # SE bottleneck width is derived from the block's *input* filters and is
      # at least 1; the SE output width matches the expanded channel count.
      num_reduced_filters = max(
          1, int(self._block_args.input_filters * self._block_args.se_ratio))
      self._se = SE(
          self._global_params, num_reduced_filters, filters, name='se')
    else:
      self._se = None
    # Output phase.
    filters = self._block_args.output_filters
    self._project_conv = tf.keras.layers.Conv2D(
        filters=filters,
        kernel_size=[1, 1],
        strides=[1, 1],
        kernel_initializer=conv_kernel_initializer,
        padding='same',
        data_format=self._data_format,
        use_bias=False,
        name=get_conv_name())
    self._bn2 = self._batch_norm(
        axis=self._channel_axis,
        momentum=self._batch_norm_momentum,
        epsilon=self._batch_norm_epsilon,
        name=get_bn_name())
  def call(self, inputs, training, survival_prob=None):
    """Implementation of call().

    Args:
      inputs: the inputs tensor.
      training: boolean, whether the model is constructed for training.
      survival_prob: float, between 0 to 1, drop connect rate.

    Returns:
      A output tensor.
    """
    logging.info('Block %s input shape: %s', self.name, inputs.shape)
    x = inputs
    # creates conv 2x2 kernel
    # self.super_pixel is None unless block_args.super_pixel == 1.
    if self.super_pixel:
      x = self.super_pixel(x, training)
      logging.info('SuperPixel %s: %s', self.name, x.shape)
    if self._block_args.fused_conv:
      # If use fused mbconv, skip expansion and use regular conv.
      x = self._relu_fn(self._bn1(self._fused_conv(x), training=training))
      logging.info('Conv2D shape: %s', x.shape)
    else:
      # Otherwise, first apply expansion and then apply depthwise conv.
      # Expansion is skipped when expand_ratio == 1 (no _expand_conv exists).
      if self._block_args.expand_ratio != 1:
        x = self._relu_fn(self._bn0(self._expand_conv(x), training=training))
        logging.info('Expand shape: %s', x.shape)
      x = self._relu_fn(self._bn1(self._depthwise_conv(x), training=training))
      logging.info('DWConv shape: %s', x.shape)
    # self._se is None when squeeze-excitation is disabled for this block.
    if self._se:
      x = self._se(x)
    # Expose the tensor that feeds the projection conv.
    self.endpoints = {'expansion_output': x}
    x = self._bn2(self._project_conv(x), training=training)
    # Add identity so that quantization-aware training can insert quantization
    # ops correctly.
    x = tf.identity(x)
    if self._clip_projection_output:
      x = tf.clip_by_value(x, -6, 6)
    # NOTE(review): the original annotation claims id_skip is False by
    # default; that is not verifiable from this file -- confirm.
    if self._block_args.id_skip:
      # The residual add is only applied when the block keeps both spatial
      # size (all strides 1) and channel count.
      if all(
          s == 1 for s in self._block_args.strides
      ) and self._block_args.input_filters == self._block_args.output_filters:
        # Apply only if skip connection presents.
        if survival_prob:
          x = utils.drop_connect(x, training, survival_prob)
        x = tf.add(x, inputs)
    logging.info('Project shape: %s', x.shape)
    return x
SE结构图:
代码如下:
class SE(tf.keras.layers.Layer):
  """Squeeze-and-excitation layer.

  Pools the input over its spatial axes, passes the result through a 1x1
  reduce conv, the activation and a 1x1 expand conv, then multiplies the
  sigmoid of that result with the input.
  """

  def __init__(self, global_params, se_filters, output_filters, name=None):
    """Creates the two 1x1 convolutions of the SE layer.

    Args:
      global_params: GlobalParams providing local_pooling, data_format and
        relu_fn.
      se_filters: number of filters of the reduce convolution.
      output_filters: number of filters of the expand convolution.
      name: layer name.
    """
    super().__init__(name=name)

    self._local_pooling = global_params.local_pooling
    self._data_format = global_params.data_format
    self._relu_fn = global_params.relu_fn or tf.nn.swish

    # Squeeze and Excitation layer.
    # Both convolutions are 1x1, stride 1, biased, and differ only in filter
    # count and name.
    pointwise_kwargs = dict(
        kernel_size=[1, 1],
        strides=[1, 1],
        kernel_initializer=conv_kernel_initializer,
        padding='same',
        data_format=self._data_format,
        use_bias=True)
    self._se_reduce = tf.keras.layers.Conv2D(
        se_filters, name='conv2d', **pointwise_kwargs)
    self._se_expand = tf.keras.layers.Conv2D(
        output_filters, name='conv2d_1', **pointwise_kwargs)

  def call(self, inputs):
    """Returns `inputs` scaled by the sigmoid-gated SE tensor."""
    # Spatial axes depend on where the channel axis sits.
    if self._data_format == 'channels_first':
      h_axis, w_axis = 2, 3
    else:
      h_axis, w_axis = 1, 2

    if self._local_pooling:
      # Average pool with a window covering the full spatial extent.
      pooled = tf.nn.avg_pool(
          inputs,
          ksize=[1, inputs.shape[h_axis], inputs.shape[w_axis], 1],
          strides=[1, 1, 1, 1],
          padding='VALID')
    else:
      pooled = tf.reduce_mean(inputs, [h_axis, w_axis], keepdims=True)

    reduced = self._se_reduce(pooled)
    activated = self._relu_fn(reduced)
    se_tensor = self._se_expand(activated)
    logging.info('Built SE %s : %s', self.name, se_tensor.shape)
    return tf.sigmoid(se_tensor) * inputs
到此本章内容结束,下一章,Box与Class回归细节分析,敬请关注!!!