1. ResNeXt网络
即在 Bottleneck 模块的基础上,实现了每个模块内多分支的并行结构。
2.1 代码实现
# author: contact {chuyunxinlan at gmail dot com}
# time: 2023/3/24
# 下午6:36
# reference the official pytorch1.7.1 code
import torch.nn as nn
import torch.nn.functional as F
def conv3x3(in_plances, out_plances, stride=1, groups=1, dilation=1):
    """Build a bias-free 3x3 convolution.

    Padding is set equal to ``dilation``; ``groups=1`` (the default) means an
    ordinary, non-grouped convolution.

    Args:
        in_plances: number of input channels.
        out_plances: number of output channels.
        stride: convolution stride.
        groups: number of convolution groups.
        dilation: kernel dilation, also used as the padding amount.

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    layer = nn.Conv2d(
        in_channels=in_plances,
        out_channels=out_plances,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
    return layer
def conv1x1(in_plances, out_plances, stride=1):
    """Build a bias-free 1x1 convolution (padding left at the default of 0).

    Args:
        in_plances: number of input channels.
        out_plances: number of output channels.
        stride: convolution stride.

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    layer = nn.Conv2d(
        in_channels=in_plances,
        out_channels=out_plances,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return layer
class Bottleneck(nn.Module):
    """Bottleneck residual block: 1x1 conv -> 3x3 conv -> 1x1 conv.

    The first 1x1 conv maps the input to ``width`` channels, the 3x3 conv keeps
    ``width`` channels (and carries the stride, groups and dilation), and the
    last 1x1 conv maps to ``planes * expansion`` channels.

    Torchvision's bottleneck places the downsampling stride on the 3x3
    convolution (``self.conv2``), whereas the original implementation places
    it on the first 1x1 convolution (``self.conv1``), following
    "Deep Residual Learning for Image Recognition"
    https://arxiv.org/abs/1512.03385.
    This variant is also known as ResNet V1.5 and improves accuracy according to
    https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    Args:
        inplanes: number of input channels.
        planes: base channel count; the block outputs ``planes * expansion``.
        stride: stride applied by the 3x3 convolution.
        downsanple: (sic) optional module applied to the input to produce the
            shortcut branch; ``None`` uses the input unchanged.
        groups: number of groups of the 3x3 convolution.
        base_width: per-group width; scales ``width`` relative to 64.
        dilation: dilation of the 3x3 convolution.
        norm_layer: callable building a normalisation layer from a channel
            count; defaults to ``nn.BatchNorm2d``.
    """

    # Output channels are ``planes * expansion``.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsanple=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer

        # Channel count used by conv1's output and by the 3x3 convolution.
        width = int(planes * (base_width / 64.)) * groups
        out_channels = planes * self.expansion

        # When stride != 1, both self.conv2 and self.downsample reduce the
        # spatial resolution of their input.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = make_norm(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = make_norm(width)
        # Expand from ``width`` to ``planes * expansion`` channels.
        self.conv3 = conv1x1(width, out_channels)
        self.bn3 = make_norm(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsanple
        self.stride = stride

    def forward(self, x):
        """Apply the three conv/norm stages and add the shortcut branch."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Shortcut: the raw input, or its projection when a downsample module
        # was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
import torch
# note : construct the network
class ResNet(nn.Module):
    """ResNet / ResNeXt network assembled from a residual ``block`` class.

    Layout: 7x7 stride-2 stem convolution -> norm -> ReLU -> 3x3 stride-2
    max-pool -> four stages with base widths 64/128/256/512 -> adaptive average
    pool to 1x1 -> flatten -> linear classifier.

    Args:
        block: residual block class. It must expose an ``expansion`` attribute
            and be constructible as ``block(inplanes, planes, stride,
            downsample, groups, base_width, dilation, norm_layer)`` as well as
            with the keywords ``groups``, ``base_width``, ``dilation`` and
            ``norm_layer``.
        layers: number of blocks in each of the four stages (indices 0-3).
        num_classes: output size of the final linear layer.
        zero_init_residual: if True, zero the weight of ``bn3`` in every
            ``Bottleneck`` so each residual branch starts at zero.
        groups: number of groups passed to every block.
        width_per_group: per-group width passed to every block as
            ``base_width``.
        replace_stride_with_dilation: ``None`` or three booleans, one per
            stage 2-4; a true entry replaces that stage's stride-2 with
            dilation.
        norm_layer: callable building a normalisation layer from a channel
            count; defaults to ``nn.BatchNorm2d``.
        in_channels: number of channels of the input image (default 3).

    Raises:
        ValueError: if ``replace_stride_with_dilation`` is given and does not
            have exactly three elements.
    """

    def __init__(self, block, layers, num_classes=4, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, in_channels=3):
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # Each element says whether the stride-2 of the corresponding
            # stage should be replaced by a dilated convolution.
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group  # channels per group

        # Stem; ``in_channels`` sets the number of input image channels.
        self.conv1 = nn.Conv2d(in_channels, self.inplanes, kernel_size=7,
                               stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch, so that the
        # residual branch starts with zeros and each residual block behaves
        # like an identity. This improves the model by 0.2~0.3% according to
        # https://arxiv.org/abs/1706.02677
        # Only Bottleneck blocks are handled here.
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one stage made of ``blocks`` stacked ``block`` instances.

        Args:
            block: residual block class to instantiate.
            planes: base channel count of the stage.
            blocks: how many blocks to stack.
            stride: stride of the first block of the stage.
            dilate: if True, multiply ``self.dilation`` by ``stride`` and use
                stride 1 instead.

        Returns:
            ``nn.Sequential`` holding the stage's blocks. ``self.inplanes`` is
            updated to ``planes * block.expansion`` as a side effect.
        """
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        # A 1x1-conv projection is needed on the shortcut when the stride is
        # not 1 or the channel count changes.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        # The first block receives the stage's stride, the shortcut projection
        # and the dilation that was in effect before this stage.
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        # Every following block takes the expanded channel count as input.
        self.inplanes = planes * block.expansion
        # Start at 1: the first block has already been added.
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))
        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        """Run stem, the four stages, pooling, flatten and the classifier."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def forward(self, x):
        """Forward pass; delegates to ``_forward_impl``."""
        return self._forward_impl(x)
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """Instantiate a ``ResNet`` for the given architecture.

    Args:
        arch: architecture name; it would select the weights to download, but
            it is only used in the warning message here.
        block: residual block class passed to ``ResNet``.
        layers: per-stage block counts passed to ``ResNet``.
        pretrained: requests pretrained weights. Loading is not implemented,
            so a true value only triggers a warning.
        progress: unused here; kept for signature compatibility.
        **kwargs: forwarded to the ``ResNet`` constructor.

    Returns:
        The constructed model with its default initialisation.
    """
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        # Weight loading (state-dict download + load_state_dict) is not wired
        # up in this file; warn instead of silently returning random weights.
        import warnings

        warnings.warn(
            "pretrained=True was requested for '{}', but loading pretrained "
            "weights is not implemented; returning a randomly initialised "
            "model.".format(arch),
            stacklevel=2,
        )
    return model
def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
    """Build the ResNeXt-50 (32x4d) model.

    ``groups`` and ``width_per_group`` in ``kwargs`` are always overwritten:
    32 groups (the number of parallel branches) with a width of 4 per group.

    Args:
        pretrained: forwarded to ``_resnet``.
        progress: forwarded to ``_resnet``.
        **kwargs: extra ``ResNet`` constructor arguments.

    Returns:
        The model returned by ``_resnet``.
    """
    kwargs.update(groups=32, width_per_group=4)
    # One entry per stage; each value is how many Bottleneck blocks it stacks.
    blocks_per_stage = [3, 4, 6, 3]
    return _resnet('resnext50_32x4d', Bottleneck, blocks_per_stage,
                   pretrained, progress, **kwargs)
if __name__ == "__main__":
    # Smoke test: build the 4-class ResNeXt-50 and push one random batch
    # through it.
    model_resxt = resnext50_32x4d(num_classes=4)
    image = torch.rand(8, 3, 96, 510)
    out = model_resxt(image)
    # Batch of 8 and num_classes=4, so this prints torch.Size([8, 4]).
    print(out.shape)
2.2 网络结构
ResNet(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): Sequential(
(0): Bottleneck(
(conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer2): Sequential(
(0): Bottleneck(
(conv1): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer3): Sequential(
(0): Bottleneck(
(conv1): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(4): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(5): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer4): Sequential(
(0): Bottleneck(
(conv1): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(2048, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(2048, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
(fc): Linear(in_features=2048, out_features=4, bias=True)
)
2. 自主构建的网络
主要的结构是:前期使用了 inverted_residual 结构,
目的是不让各个通道之间的信息相互交流,
保持原始的通道信息;
这样一直到 ResNeXt 的网络结构时,才开始通道交融。
该网络由于采用了多分支并行的结构,导致不能够搭建得很深:
在输入为 (2, 9, 576, 600) 的情况下,占用了 20G 的显存和 100G 的内存。
2.1 网络结构
DepthSepResXt(
(input_steam): InputSteam(
(features): Sequential(
(0): ConvNormActivation(
(0): Conv2d(9, 18, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=9, bias=False)
(1): BatchNorm2d(18, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
(2): Hardswish()
)
(1): InvertedResidual(
(block): Sequential(
(0): ConvNormActivation(
(0): Conv2d(18, 18, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=18, bias=False)
(1): BatchNorm2d(18, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
)
(1): SqueezeExcitation(
(avgpool): AdaptiveAvgPool2d(output_size=1)
(fc1): Conv2d(18, 9, kernel_size=(1, 1), stride=(1, 1))
(fc2): Conv2d(9, 18, kernel_size=(1, 1), stride=(1, 1))
(activation): ReLU()
(scale_activation): Hardsigmoid()
)
(2): ConvNormActivation(
(0): Conv2d(18, 18, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(18, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
)
)
)
(2): InvertedResidual(
(block): Sequential(
(0): ConvNormActivation(
(0): Conv2d(18, 36, kernel_size=(1, 1), stride=(1, 1), groups=18, bias=False)
(1): BatchNorm2d(36, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
)
(1): ConvNormActivation(
(0): Conv2d(36, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=36, bias=False)
(1): BatchNorm2d(36, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
)
(2): SqueezeExcitation(
(avgpool): AdaptiveAvgPool2d(output_size=1)
(fc1): Conv2d(36, 18, kernel_size=(1, 1), stride=(1, 1))
(fc2): Conv2d(18, 36, kernel_size=(1, 1), stride=(1, 1))
(activation): ReLU()
(scale_activation): Hardsigmoid()
)
(3): ConvNormActivation(
(0): Conv2d(36, 36, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(36, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
)
)
)
(3): InvertedResidual(
(block): Sequential(
(0): ConvNormActivation(
(0): Conv2d(36, 72, kernel_size=(1, 1), stride=(1, 1), groups=36, bias=False)
(1): BatchNorm2d(72, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
)
(1): ConvNormActivation(
(0): Conv2d(72, 72, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=72, bias=False)
(1): BatchNorm2d(72, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
(2): ReLU(inplace=True)
)
(2): SqueezeExcitation(
(avgpool): AdaptiveAvgPool2d(output_size=1)
(fc1): Conv2d(72, 36, kernel_size=(1, 1), stride=(1, 1))
(fc2): Conv2d(36, 72, kernel_size=(1, 1), stride=(1, 1))
(activation): ReLU()
(scale_activation): Hardsigmoid()
)
(3): ConvNormActivation(
(0): Conv2d(72, 72, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(72, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
)
)
)
)
)
(layer1): Sequential(
(0): Bottleneck(
(conv1): Conv2d(72, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(72, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(144, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(128, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer2): Sequential(
(0): Bottleneck(
(conv1): Conv2d(144, 224, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(224, 224, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(224, 236, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(236, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(144, 236, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(236, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(236, 224, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(224, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(224, 236, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(236, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer3): Sequential(
(0): Bottleneck(
(conv1): Conv2d(236, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(236, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer4): Sequential(
(0): Bottleneck(
(conv1): Conv2d(256, 288, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(288, 288, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(288, 312, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(312, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 312, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(312, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
(1): Bottleneck(
(conv1): Conv2d(312, 288, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(288, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(288, 312, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(312, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
)
)
(layer5): Sequential(
(0): Bottleneck(
(conv1): Conv2d(312, 352, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(352, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(352, 352, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(352, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(352, 372, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(372, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(312, 372, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(372, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer6): Sequential(
(0): Bottleneck(
(conv1): Conv2d(372, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(384, 412, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(412, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(372, 412, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(412, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer7): Sequential(
(0): Bottleneck(
(conv1): Conv2d(412, 448, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(448, 448, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(448, 452, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(452, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(412, 452, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(452, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer8): Sequential(
(0): Bottleneck(
(conv1): Conv2d(452, 448, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(448, 448, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(448, 472, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(472, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(452, 472, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(472, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer9): Sequential(
(0): Bottleneck(
(conv1): Conv2d(472, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(480, 480, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(480, 492, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(492, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(472, 492, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(492, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(layer10): Sequential(
(0): Bottleneck(
(conv1): Conv2d(492, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
(bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(492, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
)
)
(fc): Sequential(
(0): Dropout(p=0.1, inplace=False)
(1): Linear(in_features=2048, out_features=128, bias=True)
(2): ReLU(inplace=True)
(3): Dropout(p=0.1, inplace=False)
(4): Linear(in_features=128, out_features=128, bias=True)
(5): ReLU(inplace=True)
)
(cls_fc): Linear(in_features=128, out_features=4, bias=True)
)
- 上述网络结构,是通过如下代码实现的:
# author: Chu Yun, contact {chuyunxinlan at gmail dot com}
# time: 2023/3/25
# 下午4:06
# 该网络实现了:
# 1. 先使用 inverted_residual 模块,通过通道深度可分离的操作,
# 将输入的通道数扩充到64个通道,并且在这个过程中保持特征图大小不变;
# 2. inverted_residual 模块中使用了 se_layer 通道注意力机制,用于筛选通道;
# 3. 将扩充后的64通道对接到 ResNeXt 的主干网络中去;
from typing import Any, Callable, List, Optional, Sequence
from types import FunctionType
import torch
from torch import Tensor
# author: Chu Yun, contact {chuyunxinlan at gmail dot com}
# time: 2023/3/21
# 下午6:18
# reference the official pytorch code
from typing import Any, Callable, List, Optional, Sequence
from types import FunctionType
import torch
from torch import Tensor
def _log_api_usage_once(obj: Any) -> None:
"""
Logs API usage(module and name) within an organization.
In a large ecosystem, it's often useful to track the PyTorch and
TorchVision APIs usage. This API provides the similar functionality to the
logging module in the Python stdlib. It can be used for debugging purpose
to log which methods are used and by default it is inactive, unless the user
manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
Please note it is triggered only once for the same API call within a process.
It does not collect any data from open-source users since it is no-op by default.
For more information, please refer to
* PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
* Logging policy: https://github.com/pytorch/vision/issues/5052;
Args:
obj (class instance or method): an object to extract info from.
"""
if not obj.__module__.startswith("torchvision"):
return
name = obj.__class__.__name__
if isinstance(obj, FunctionType):
name = obj.__name__
torch._C._log_api_usage_once(f"{obj.__module__}.{name}")
# 用于确保通道数被调整为最接近的 divisor 的整数倍;
def _make_divisiable(v, divisor, min_value=None):
    """
    Round ``v`` to the nearest multiple of ``divisor``, never below ``min_value``.

    Taken from the original TensorFlow MobileNet code:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py

    :param v: requested channel count (may be fractional).
    :param divisor: the result is a multiple of this value.
    :param min_value: lower bound for the result; defaults to ``divisor``.
    :return: the adjusted channel count.
    """
    lower_bound = divisor if min_value is None else min_value
    # Adding half a divisor before the floor division rounds to the nearest multiple.
    nearest_multiple = int(v + divisor / 2) // divisor * divisor
    adjusted = max(lower_bound, nearest_multiple)
    # Rounding down must not lose more than 10% of the requested value.
    if adjusted < 0.9 * v:
        adjusted += divisor
    return adjusted
class ConvNormActivation(torch.nn.Sequential):
    """
    Sequential block made of a 2D convolution, an optional normalisation layer
    and an optional activation layer, in that order.

    Args:
        in_channels (int): Number of channels in the input image.
        out_channels (int): Number of channels produced by the block.
        kernel_size (int, optional): Size of the convolving kernel. Default: 3
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int, optional): Padding passed to the convolution. Default: None,
            in which case it is computed as ``(kernel_size - 1) // 2 * dilation``.
        groups (int, optional): Number of blocked connections from input channels
            to output channels. Default: 1
        norm_layer (Callable[..., torch.nn.Module], optional): Factory called with
            ``out_channels`` to build the layer placed after the convolution.
            ``None`` skips it. Default: ``torch.nn.BatchNorm2d``
        activation_layer (Callable[..., torch.nn.Module], optional): Factory for the
            layer placed last in the block. ``None`` skips it. Default: ``torch.nn.ReLU``
        dilation (int): Spacing between kernel elements. Default: 1
        inplace (bool, optional): Forwarded to ``activation_layer`` as ``inplace=...``;
            when ``None`` the activation is built without arguments. Default: ``True``
        bias (bool, optional): Whether the convolution has a bias. Default: None,
            in which case a bias is used only if ``norm_layer is None``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: Optional[int] = None,
        groups: int = 1,
        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
        dilation: int = 1,
        inplace: Optional[bool] = True,
        bias: Optional[bool] = None,
    ) -> None:
        # NOTE: the default padding is not 0. A 3x3 kernel gets padding 1 and a
        # 1x1 kernel gets padding 0 (both scaled by the dilation).
        if padding is None:
            padding = (kernel_size - 1) // 2 * dilation
        if bias is None:
            bias = norm_layer is None

        conv = torch.nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        stages = [conv]
        if norm_layer is not None:
            stages.append(norm_layer(out_channels))
        if activation_layer is not None:
            activation_kwargs = {"inplace": inplace} if inplace is not None else {}
            stages.append(activation_layer(**activation_kwargs))

        super().__init__(*stages)
        _log_api_usage_once(self)
        # Exposed so callers can read the block's output width.
        self.out_channels = out_channels
# note 通道注意力模块, 和CBAM 模块的区别,他缺少了空间注意力模块;
class SqueezeExcitation(torch.nn.Module):
    """
    Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).

    The input is globally average-pooled, passed through two 1x1 convolutions
    with ``activation`` in between, squashed by ``scale_activation`` and then
    multiplied back onto the input. ``activation`` and ``scale_activation``
    correspond to ``delta`` and ``sigma`` in eq. 3 of the paper.

    Args:
        input_channels (int): Number of channels in the input image.
        squeeze_channels (int): Number of squeeze channels.
        activation (Callable[..., torch.nn.Module], optional): ``delta`` activation.
            Default: ``torch.nn.ReLU``
        scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation.
            Default: ``torch.nn.Sigmoid``
    """

    def __init__(
        self,
        input_channels: int,
        squeeze_channels: int,
        activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
        scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
    ) -> None:
        super().__init__()
        _log_api_usage_once(self)  # record that this class was used
        self.avgpool = torch.nn.AdaptiveAvgPool2d(1)
        # 1x1 convolutions are used in place of fully connected layers.
        self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1)
        self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1)
        self.activation = activation()
        self.scale_activation = scale_activation()

    def _scale(self, input: Tensor) -> Tensor:
        """Compute the gating tensor for ``input``."""
        squeezed = self.fc1(self.avgpool(input))
        excited = self.fc2(self.activation(squeezed))
        return self.scale_activation(excited)

    def forward(self, input: Tensor) -> Tensor:
        """Return ``input`` multiplied by its gating tensor."""
        return self._scale(input) * input
class InvertedResidualConfig:
    """
    Settings for one inverted-residual block.

    Stores the information listed in Tables 1 and 2 of the MobileNetV3 paper.
    A different parameter set yields a differently shaped block. The three
    channel counts are scaled by ``width_mult`` and rounded through
    ``adjust_channels`` before being stored.
    """

    def __init__(
        self,
        input_channels: int,
        kernel: int,
        expanded_channels: int,
        out_channels: int,
        use_se: bool,
        activation: str,
        stride: int,
        dilation: int,
        width_mult: float,
    ):
        adjust = self.adjust_channels
        self.input_channels = adjust(input_channels, width_mult)
        self.kernel = kernel
        self.expanded_channels = adjust(expanded_channels, width_mult)
        self.out_channels = adjust(out_channels, width_mult)
        self.use_se = use_se  # whether the block contains an SE channel-attention layer
        self.use_hs = activation == "HS"  # True only for the "HS" activation code
        self.stride = stride
        self.dilation = dilation  # values above 1 enable dilated convolution

    @staticmethod
    def adjust_channels(channels: int, width_mult: float):
        """Scale ``channels`` by ``width_mult`` and round to a multiple of 9."""
        # NOTE: 9 is the multiple every block's channel count must satisfy.
        scaled = channels * width_mult
        return _make_divisiable(scaled, 9)
from torch import nn
from functools import partial
class InvertedResidual(nn.Module):
    """
    Inverted-residual block, following section 5 of the MobileNetV3 paper.

    The block is built from, in order: an optional grouped 1x1 expansion
    (only when the expanded width differs from the input width), a depthwise
    convolution, an optional SE layer, and a 1x1 projection without activation.

    Args:
        cnf: channel, kernel, stride and dilation settings for the block.
        norm_layer: factory for the normalisation layer used in every conv stage.
        se_layer: factory for the channel-attention layer, called as
            ``se_layer(expanded_channels, squeeze_channels)``.

    Raises:
        ValueError: if ``cnf.stride`` is outside the range 1..2.
    """

    def __init__(
        self,
        cnf: InvertedResidualConfig,
        norm_layer: Callable[..., nn.Module],
        se_layer: Callable[..., nn.Module] = partial(SqueezeExcitation, scale_activation=nn.Hardsigmoid),
    ):
        super().__init__()
        if not (1 <= cnf.stride <= 2):  # only strides 1 and 2 are supported
            raise ValueError(" illegal stride value")

        in_ch = cnf.input_channels
        mid_ch = cnf.expanded_channels
        out_ch = cnf.out_channels

        # NOTE: the skip connection is used only when the stride is 1 AND the
        # input and output channel counts match.
        self.use_res_connet = cnf.stride == 1 and in_ch == out_ch

        act = nn.Hardswish if cnf.use_hs else nn.ReLU
        stages: List[nn.Module] = []

        # Pointwise 1x1 expansion, grouped by the input channel count.
        if mid_ch != in_ch:
            expand = ConvNormActivation(
                in_ch,
                mid_ch,
                kernel_size=1,
                groups=in_ch,
                norm_layer=norm_layer,
                activation_layer=act,
            )
            stages.append(expand)

        # Depthwise convolution: one group per expanded channel.
        # With dilation > 1 the stride is forced to 1, otherwise the configured stride is used.
        depthwise_stride = 1 if cnf.dilation > 1 else cnf.stride
        depthwise = ConvNormActivation(
            mid_ch,
            mid_ch,
            kernel_size=cnf.kernel,
            stride=depthwise_stride,
            dilation=cnf.dilation,
            groups=mid_ch,
            norm_layer=norm_layer,
            activation_layer=act,
        )
        stages.append(depthwise)

        if cnf.use_se:
            # NOTE: the squeeze width is half the expanded width, rounded to a multiple of 9.
            squeeze_ch = _make_divisiable(mid_ch // 2, 9)
            stages.append(se_layer(mid_ch, squeeze_ch))

        # Pointwise 1x1 projection back down to the output width, no activation.
        project = ConvNormActivation(
            mid_ch, out_ch, kernel_size=1, norm_layer=norm_layer, activation_layer=None
        )
        stages.append(project)

        # The collected stages form the block that forward() runs.
        self.block = nn.Sequential(*stages)
        # These two attributes are not read anywhere in this class.
        self.out_channels = out_ch
        self._is_cn = cnf.stride > 1

    def forward(self, input: Tensor) -> Tensor:
        """Run the block and add the input back in when the skip connection is enabled."""
        result = self.block(input)
        if not self.use_res_connet:
            return result
        result += input
        return result
# note : 使用深度可分离卷积 和通道注意力将输入通道数扩充
class InputSteam(nn.Module):
    """
    Input stem: a grouped 3x3 convolution followed by inverted-residual blocks.

    The first layer takes 9 input channels (``groups=9``, stride 1) and
    produces ``inverted_residual_setting[0].input_channels`` channels. One
    ``block`` is then appended for every entry of ``inverted_residual_setting``.
    """

    def __init__(
        self,
        inverted_residual_setting: List[InvertedResidualConfig],
        block: Optional[Callable[..., nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        dropout: float = 0.2,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            inverted_residual_setting: one config object per block to build.
            block: factory called as ``block(cnf, norm_layer)`` for each config.
                Defaults to ``InvertedResidual``.
            norm_layer: normalisation-layer factory. Defaults to ``BatchNorm2d``
                with ``eps=0.001`` and ``momentum=0.01``.
            dropout: accepted but not used by this class.
            **kwargs: accepted but not used by this class.

        Raises:
            ValueError: if ``inverted_residual_setting`` is empty.
            TypeError: if it is not a sequence of ``InvertedResidualConfig``.
        """
        super().__init__()
        _log_api_usage_once(self)

        if not inverted_residual_setting:
            raise ValueError(" The inverted residual setting should not be empty")
        # The settings must be an ordered sequence whose items are all config objects.
        if not isinstance(inverted_residual_setting, Sequence) or not all(
            isinstance(s, InvertedResidualConfig) for s in inverted_residual_setting
        ):
            raise TypeError("The inverted residual setting should be List[InvertedResidualConfig]")

        if block is None:
            block = InvertedResidual
        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps=0.001, momentum=0.01)

        # First layer of the network. NOTE: groups was changed to 9 here.
        stem_out_channels = inverted_residual_setting[0].input_channels
        stem = ConvNormActivation(
            9,
            stem_out_channels,
            kernel_size=3,
            stride=1,
            groups=9,
            norm_layer=norm_layer,
            activation_layer=nn.Hardswish,
        )

        # One block per configuration entry, in the order given.
        stages: List[nn.Module] = [stem]
        stages.extend(block(cnf, norm_layer) for cnf in inverted_residual_setting)
        self.features = nn.Sequential(*stages)

        # Weight initialisation for every sub-module created above.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        """Run ``x`` through the stem and all blocks."""
        return self.features(x)

    def forward(self, x: Tensor) -> Tensor:
        """Return the features extracted from ``x``."""
        return self._forward_impl(x)
# author: Chu Yun, contact {chuyunxinlan at gmail dot com}
# time: 2023/3/24
# 下午6:36
# reference the official pytorch1.7.1 code
import torch.nn as nn
import torch.nn.functional as F
def conv3x3(in_plances, out_plances, stride=1, groups=1, dilation=1 ):
    """Build a bias-free 3x3 convolution whose padding equals ``dilation``.

    ``groups=1`` (the default) means no grouped convolution is used.
    """
    return nn.Conv2d(
        in_channels=in_plances,
        out_channels=out_plances,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
def conv1x1(in_plances, out_plances, stride =1):
    """Build a bias-free 1x1 convolution (padding is left at its default of 0)."""
    return nn.Conv2d(
        in_channels=in_plances,
        out_channels=out_plances,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
class Bottleneck(nn.Module):
    """
    Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convolutions plus a skip path.

    The stride is applied in the 3x3 convolution (``self.conv2``), whereas the
    original implementation from "Deep Residual Learning for Image Recognition"
    (https://arxiv.org/abs/1512.03385) applies it in the first 1x1 convolution.
    This variant is also known as ResNet V1.5 and is reported to improve accuracy
    (https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch).

    The block outputs ``planes * expansion`` channels.
    """

    expansion = 2

    def __init__(self, inplanes, planes, stride=1, downsanple=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, ):
        super(Bottleneck, self).__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer

        # Channel count used by the first 1x1 and the 3x3 convolution;
        # it scales with base_width and with the number of groups.
        width = int(planes * (base_width / 64.)) * groups
        out_planes = planes * self.expansion

        # When stride != 1, both self.conv2 and self.downsample shrink the input.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        # The last 1x1 convolution maps width to planes * expansion channels.
        self.conv3 = conv1x1(width, out_planes)
        self.bn3 = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsanple
        self.stride = stride

    def forward(self, x):
        """Apply the three conv stages, add the skip path, then apply ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # The skip path is the raw input unless a downsample module was supplied.
        identity = x if self.downsample is None else self.downsample(x)
        out += identity
        return self.relu(out)
import torch
# note : construct the network
class DepthSepResXt(nn.Module):
    """
    Inverted-residual input stem followed by ten residual stages and an MLP head.

    Data flow in ``forward``: ``input_steam`` -> ``layer1`` ... ``layer10`` ->
    flatten -> ``fc`` -> ``cls_fc``.

    ``layers`` must provide ten block counts, one per stage. The head starts
    with ``nn.Linear(2048, 128)``, so the flattened output of ``layer10`` must
    have exactly 2048 features per sample.
    """

    def __init__(self, block, layers, num_classes = 4, zero_init_residual=False,
                 groups=1, width_per_group=64, drop_out=0.1, replace_stride_with_dilation=None,
                 norm_layer=None):
        """
        Args:
            block: residual block class; must expose ``expansion``.
            layers: number of blocks in each of the ten stages.
            num_classes: size of the final output.
            zero_init_residual: zero the last BN weight of every ``Bottleneck``.
            groups: group count forwarded to every block.
            width_per_group: base width forwarded to every block.
            drop_out: dropout probability used in the head.
            replace_stride_with_dilation: three flags; each one says whether a
                stride-2 stage should use dilation instead. The third flag is
                shared by ``layer4`` through ``layer10``.
            norm_layer: normalisation-layer factory, default ``nn.BatchNorm2d``.

        Raises:
            ValueError: if ``replace_stride_with_dilation`` does not have 3 items.
        """
        super(DepthSepResXt, self).__init__()
        self._norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer

        # NOTE: inplanes must equal the number of channels the input stem outputs.
        self.inplanes = 72
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # Each element says whether a stride-2 stage uses dilated convolution instead.
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(" replce_stride_with_dilation should be None"
                             "or a 3-element tuple ,got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group  # base width handed to every block

        # Config arguments, in order: input channels, kernel size, expanded
        # channels, output channels, use SE, activation code ("RE" here),
        # stride, dilation. width_mult is bound through partial.
        width_mult = 1
        bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)
        # The skip connection is only enabled when input and output channels match.
        # NOTE: the channel counts are kept at whole-number ratios because every
        # inverted-residual block uses grouped convolutions, and torch requires
        # both in_channels and out_channels to be divisible by groups.
        self.inverted_residual_setting = [
            bneck_conf(18, 3, 18, 18, True, "RE", 1, 1),  # C1
            bneck_conf(18, 3, 36, 36, True, "RE", 1, 1),  # C2
            bneck_conf(36, 3, 72, 72, True, "RE", 1, 1),
        ]
        self.input_steam = InputSteam(self.inverted_residual_setting)

        # layer1 keeps the resolution; layer2..layer10 each use stride 2.
        self.layer1 = self._make_layer(block, 72, layers[0])
        stage_planes = (118, 128, 156, 186, 206, 226, 236, 246, 256)
        stage_dilate = (
            replace_stride_with_dilation[0],
            replace_stride_with_dilation[1],
        ) + (replace_stride_with_dilation[2],) * 7
        for position, (planes, dilate) in enumerate(zip(stage_planes, stage_dilate), start=1):
            stage = self._make_layer(block, planes, layers[position], stride=2, dilate=dilate)
            setattr(self, "layer{}".format(position + 1), stage)

        self.fc = nn.Sequential(
            nn.Dropout(drop_out), nn.Linear(2048, 128), nn.ReLU(True),
            nn.Dropout(drop_out), nn.Linear(128, 128), nn.ReLU(True))
        self.cls_fc = nn.Linear(128, num_classes)

        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)

        # Zero-initialize the last BN in each residual branch, so that the
        # residual branch starts with zeros and each block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for module in self.modules():
                if isinstance(module, Bottleneck):
                    nn.init.constant_(module.bn3.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """
        Build one stage made of ``blocks`` instances of ``block``.

        Args:
            block: block class to instantiate.
            planes: base channel count; the stage outputs ``planes * block.expansion``.
            blocks: how many blocks to stack.
            stride: stride of the first block.
            dilate: when true, the stride is folded into ``self.dilation`` and
                the first block runs with stride 1.
        """
        norm_layer = self._norm_layer
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1

        out_planes = planes * block.expansion
        # A 1x1 projection is needed on the skip path whenever the first block
        # changes the resolution or the channel count.
        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                norm_layer(out_planes),
            )

        # The first block takes the incoming channel count and the stride.
        stack = [block(self.inplanes, planes, stride, downsample, self.groups,
                       self.base_width, previous_dilation, norm_layer)]
        # Every later block takes the expanded channel count as its input.
        self.inplanes = out_planes
        stack.extend(
            block(self.inplanes, planes, groups=self.groups,
                  base_width=self.base_width, dilation=self.dilation,
                  norm_layer=norm_layer)
            for _ in range(1, blocks)
        )
        return nn.Sequential(*stack)

    def _forward_impl(self, x):
        """Run the stem, the ten stages and the head on ``x``."""
        x = self.input_steam(x)
        for index in range(1, 11):
            x = getattr(self, "layer{}".format(index))(x)
        x = torch.flatten(x, 1)
        return self.cls_fc(self.fc(x))

    def forward(self, x):
        """Return the class scores for ``x``."""
        return self._forward_impl(x)
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """
    Instantiate a ``DepthSepResXt`` model.

    Args:
        arch: architecture name; only used in the warning message below.
        block: residual block class passed to ``DepthSepResXt``.
        layers: per-stage block counts passed to ``DepthSepResXt``.
        pretrained: requesting pretrained weights is not supported; see below.
        progress: currently unused.
        **kwargs: forwarded to the ``DepthSepResXt`` constructor.

    Returns:
        The freshly constructed model with its default initialisation.
    """
    model = DepthSepResXt(block, layers, **kwargs)
    if pretrained:
        # Weight loading is not implemented. Warn instead of silently handing
        # back an untrained model to a caller that asked for a pretrained one.
        import warnings

        warnings.warn(
            "Loading pretrained weights is not implemented for '{}'; "
            "returning a randomly initialised model.".format(arch),
            UserWarning,
            stacklevel=2,
        )
    return model
def inverted_resxt_32x4d(pretrained=False, progress=True, **kwargs):
    """
    Build the 32x4d variant: 32 groups with a base width of 4 per group.

    ``groups`` and ``width_per_group`` passed by the caller are overridden.
    The remaining keyword arguments go to the model constructor unchanged.
    """
    # groups splits the channels into that many parallel branches;
    # width_per_group sets the base width of each group.
    kwargs.update(groups=32, width_per_group=4)
    # One entry per stage; each entry is the number of Bottleneck blocks stacked in it.
    blocks_per_stage = [2, 2, 2, 2, 1, 1, 1, 1, 1, 1]
    return _resnet('depth_separa_resxt_32x4d', Bottleneck, blocks_per_stage,
                   pretrained, progress, **kwargs)
if __name__ == "__main__":
    # Smoke test: push one random 9-channel 576x600 image through the model
    # and print the shape of the result.
    model_resxt = inverted_resxt_32x4d(num_classes=4,)
    image = torch.rand(1, 9, 576, 600)
    out = model_resxt(image)
    print(out.shape)
2.2 重新设计轻量化的结构
由于上述网络的每一层都使用了 groups=32 的多分支结构, 这里重新设计一个轻量化的结构:
1. 前期使用 inverted_residual + 通道注意力 2层, 实现通道扩充
2. 中间使用 resNeXt 4层,每层中使用2个bottleNeck, 带有分支功能的bottleNeck 完成多分支并行;
3. 后期使用普通的 BottleNeck, 完成卷积运算。