ResNeXt网络实现-CSDN博客

1. ResNeXt网络

即在 Bottleneck 模块的基础上，实现了每个模块的多分支并行结构；

1.1 代码实现

# author:  contact {chuyunxinlan at gmail dot com}
# time: 2023/3/24
#       下午6:36

# reference the official pytorch1.7.1 code

import  torch.nn  as  nn
import torch.nn.functional as F



def conv3x3(in_plances, out_plances, stride=1, groups=1, dilation=1):
    """Build a bias-free 3x3 convolution layer.

    The padding is set equal to ``dilation``, so with ``stride=1`` the
    spatial size of the input is preserved. ``groups=1`` (the default)
    gives an ordinary, non-grouped convolution.
    """
    conv_options = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
    return nn.Conv2d(in_plances, out_plances, **conv_options)


def conv1x1(in_plances, out_plances, stride=1):
    """Build a bias-free 1x1 convolution layer (padding is left at its default of 0)."""
    pointwise = nn.Conv2d(
        in_plances,
        out_plances,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return pointwise



class Bottleneck(nn.Module):
    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convolutions plus a shortcut.

    The first 1x1 convolution maps the input to ``width`` channels, the 3x3
    (optionally grouped) convolution keeps that width, and the last 1x1
    convolution maps to ``planes * expansion`` channels.

    As in torchvision, the down-sampling stride is placed on the 3x3
    convolution (``self.conv2``), whereas the original implementation from
    "Deep Residual Learning for Image Recognition"
    (https://arxiv.org/abs/1512.03385) places it on the first 1x1
    convolution (``self.conv1``). This variant is also known as ResNet V1.5
    and improves accuracy according to
    https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsanple=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, ):
        """Create the three conv/norm pairs and store the optional shortcut module.

        Args:
            inplanes: Number of input channels.
            planes: Base channel count; the block outputs ``planes * expansion``.
            stride: Stride applied by the 3x3 convolution.
            downsanple: Optional module applied to the input before the
                residual addition (the spelling is part of the existing
                interface and is kept as is).
            groups: Number of groups of the 3x3 convolution.
            base_width: Per-group width, expressed relative to 64.
            dilation: Dilation (and padding) of the 3x3 convolution.
            norm_layer: Normalisation layer factory; ``nn.BatchNorm2d`` when None.
        """
        super().__init__()

        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer

        # Channel count used between the two 1x1 convolutions.
        width = int(planes * (base_width / 64.)) * groups
        expanded = planes * self.expansion

        # When stride != 1, both self.conv2 and self.downsample reduce the
        # spatial resolution of their input.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = make_norm(width)

        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = make_norm(width)

        # Map from ``width`` to ``planes * expansion`` channels.
        self.conv3 = conv1x1(width, expanded)
        self.bn3 = make_norm(expanded)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsanple
        self.stride = stride

    def forward(self, x):
        """Run the residual branch, add the shortcut and apply the final ReLU."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Use the input itself as shortcut unless a projection module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

import  torch

# note : construct  the network
class ResNet(nn.Module):
    """ResNet / ResNeXt style network: stem, four residual stages, pooled classifier.

    Args:
        block: Residual block class. It must expose an ``expansion`` attribute
            and accept the arguments passed to it in ``_make_layer``.
        layers: Sequence of four ints, the number of blocks in each stage.
        num_classes: Output size of the final linear layer.
        zero_init_residual: If True, zero-initialise the weight of the last
            norm layer (``bn3``) of every ``Bottleneck`` block.
        groups: Number of groups forwarded to every block.
        width_per_group: Per-group width forwarded to every block as
            ``base_width``.
        replace_stride_with_dilation: None, or three booleans telling, for
            stages 2-4, whether the stride-2 down-sampling is replaced by a
            dilated convolution.
        norm_layer: Normalisation layer factory; ``nn.BatchNorm2d`` when None.
        in_channels: Number of channels of the input tensor fed to the stem
            convolution. Defaults to 3, the previously hard-coded value.

    Raises:
        ValueError: If ``replace_stride_with_dilation`` is given and does not
            contain exactly three elements.
    """

    def __init__(self, block, layers, num_classes = 4,  zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None, in_channels=3):

        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self._norm_layer = norm_layer

        # Running state consumed and updated by _make_layer.
        self.inplanes = 64
        self.dilation = 1

        if replace_stride_with_dilation is None:
            # Each element says whether the stride-2 step of the matching
            # stage should be replaced by a dilated convolution.
            replace_stride_with_dilation = [False, False, False]

        if len(replace_stride_with_dilation) != 3:
            # Exactly one flag is required for each of stages 2, 3 and 4.
            raise ValueError(
                "replace_stride_with_dilation should be None "
                f"or a 3-element tuple, got {replace_stride_with_dilation}")

        self.groups = groups
        self.base_width = width_per_group  # channel width of each group

        # Stem: 7x7 stride-2 convolution followed by a 3x3 stride-2 max-pool.
        self.conv1 = nn.Conv2d(in_channels, self.inplanes, kernel_size=7, stride=2,
                               padding=3, bias=False)
        self.bn1   = norm_layer(self.inplanes)
        self.relu  = nn.ReLU(inplace=True)

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1],
                                       stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2],
                                       stride=2, dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3],
                                       stride=2, dilate=replace_stride_with_dilation[2])

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc      = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                # Only Bottleneck blocks are handled here.
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one stage made of ``blocks`` consecutive ``block`` instances.

        Args:
            block: Residual block class to instantiate.
            planes: Base channel count of the stage; each block outputs
                ``planes * block.expansion`` channels.
            blocks: Number of blocks stacked in this stage.
            stride: Stride of the first block of the stage.
            dilate: If True, the stride is folded into ``self.dilation`` and
                the first block runs with stride 1 instead.

        Returns:
            An ``nn.Sequential`` holding the blocks of the stage.
        """
        norm_layer = self._norm_layer
        downsample = None

        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1

        # A 1x1 projection on the shortcut is needed whenever the first block
        # changes the resolution or the channel count.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []

        # The first block receives the stage's stride and shortcut projection
        # and uses the dilation that was in effect before this stage.
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))

        # Every following block takes the expanded channel count as input.
        self.inplanes = planes * block.expansion

        # Start at 1 because the first block has already been added.
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        """Apply stem, the four stages, global average pooling and the classifier."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x):
        """Forward pass; delegates to ``_forward_impl``."""
        return self._forward_impl(x)



def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """Instantiate a ``ResNet`` with the given block type and stage depths.

    Args:
        arch: Architecture name; it identifies which pretrained weights would
            belong to the model and is used in the warning below.
        block: Residual block class passed to ``ResNet``.
        layers: Number of blocks per stage, passed to ``ResNet``.
        pretrained: Whether pretrained weights were requested. Weight loading
            is not implemented in this function, so a warning is emitted and
            the freshly initialised model is returned.
        progress: Accepted for interface compatibility; unused here.
        **kwargs: Extra keyword arguments forwarded to ``ResNet``.

    Returns:
        The constructed ``ResNet`` instance.
    """
    import warnings

    model = ResNet(block, layers, **kwargs)

    if pretrained:
        # Do not ignore the request silently: the caller would otherwise
        # believe that the returned model carries pretrained weights.
        warnings.warn(
            f"pretrained=True was requested for '{arch}', but loading pretrained "
            "weights is not implemented; returning a freshly initialised model.",
            stacklevel=2,
        )

    return model



def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
    """Build the ResNeXt-50 (32x4d) variant.

    ``groups`` is fixed to 32 (the number of groups, i.e. parallel branches,
    of each 3x3 convolution) and ``width_per_group`` to 4; both override any
    value the caller supplied under the same key. The list ``[3, 4, 6, 3]``
    gives the number of ``Bottleneck`` blocks stacked in each of the four
    stages.
    """
    settings = dict(kwargs, groups=32, width_per_group=4)
    stage_depths = [3, 4, 6, 3]
    return _resnet('resnext50_32x4d', Bottleneck, stage_depths,
                   pretrained, progress, **settings)



if __name__ ==  "__main__":
    # Smoke test: build the 4-class ResNeXt-50 and push one random batch
    # through it.
    model_resxt = resnext50_32x4d(num_classes=4,)

    image = torch.rand(8, 3, 96, 510)

    # The original listing was cut off at "out = mode"; complete the forward
    # pass and show the shape of the resulting logits.
    out = model_resxt(image)
    print(out.shape)

1.2 网络结构

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer2): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (3): Bottleneck(
      (conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer3): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (3): Bottleneck(
      (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (4): Bottleneck(
      (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (5): Bottleneck(
      (conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(2048, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (2): Bottleneck(
      (conv1): Conv2d(2048, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(2048, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  (fc): Linear(in_features=2048, out_features=4, bias=True)
)

2. 自主构建的网络

主要的结构是：前期使用 inverted_residual 结构，
目的是不让各个通道之间的信息相互交流，
从而保持原始的通道信息；
一直到 ResNeXt 的网络结构时，才开始进行通道融合。

该网络由于采用了多分支并行的结构，导致不能搭建得很深：
在输入为 (2, 9, 576, 600) 的情况下，占用了 20G 的显存和 100G 的内存。

2.1 网络结构

DepthSepResXt(
  (input_steam): InputSteam(
    (features): Sequential(
      (0): ConvNormActivation(
        (0): Conv2d(9, 18, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=9, bias=False)
        (1): BatchNorm2d(18, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(18, 18, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=18, bias=False)
            (1): BatchNorm2d(18, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(18, 9, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(9, 18, kernel_size=(1, 1), stride=(1, 1))
            (activation): ReLU()
            (scale_activation): Hardsigmoid()
          )
          (2): ConvNormActivation(
            (0): Conv2d(18, 18, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(18, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          )
        )
      )
      (2): InvertedResidual(
        (block): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(18, 36, kernel_size=(1, 1), stride=(1, 1), groups=18, bias=False)
            (1): BatchNorm2d(36, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (1): ConvNormActivation(
            (0): Conv2d(36, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=36, bias=False)
            (1): BatchNorm2d(36, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (2): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(36, 18, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(18, 36, kernel_size=(1, 1), stride=(1, 1))
            (activation): ReLU()
            (scale_activation): Hardsigmoid()
          )
          (3): ConvNormActivation(
            (0): Conv2d(36, 36, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(36, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          )
        )
      )
      (3): InvertedResidual(
        (block): Sequential(
          (0): ConvNormActivation(
            (0): Conv2d(36, 72, kernel_size=(1, 1), stride=(1, 1), groups=36, bias=False)
            (1): BatchNorm2d(72, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (1): ConvNormActivation(
            (0): Conv2d(72, 72, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=72, bias=False)
            (1): BatchNorm2d(72, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (2): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(72, 36, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(36, 72, kernel_size=(1, 1), stride=(1, 1))
            (activation): ReLU()
            (scale_activation): Hardsigmoid()
          )
          (3): ConvNormActivation(
            (0): Conv2d(72, 72, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(72, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          )
        )
      )
    )
  )
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(72, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(72, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(144, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer2): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(144, 224, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(224, 224, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(224, 236, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(236, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(144, 236, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(236, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(236, 224, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(224, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(224, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(224, 236, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(236, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer3): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(236, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(236, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(256, 288, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(288, 288, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(288, 312, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(312, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(256, 312, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(312, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): Bottleneck(
      (conv1): Conv2d(312, 288, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(288, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(288, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(288, 312, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(312, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
  )
  (layer5): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(312, 352, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(352, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(352, 352, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(352, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(352, 372, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(372, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(312, 372, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(372, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (layer6): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(372, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(384, 384, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(384, 412, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(412, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(372, 412, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(412, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (layer7): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(412, 448, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(448, 448, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(448, 452, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(452, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(412, 452, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(452, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (layer8): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(452, 448, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(448, 448, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(448, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(448, 472, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(472, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(452, 472, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(472, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (layer9): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(472, 480, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(480, 480, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(480, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(480, 492, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(492, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(472, 492, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(492, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (layer10): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(492, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(492, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (fc): Sequential(
    (0): Dropout(p=0.1, inplace=False)
    (1): Linear(in_features=2048, out_features=128, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU(inplace=True)
  )
  (cls_fc): Linear(in_features=128, out_features=4, bias=True)
)

上述网络结构，是通过如下代码实现的：

# author: Chu Yun,  contact {chuyunxinlan at gmail dot com}
# time: 2023/3/25
#       下午4:06


# 该网络实现了，
# 1. 先使用 inverted_residual 模块，使用通道深度可分离的操作，
# 将输入的通道数扩充到64个通道, 并且这个过程中保持特征图大小不变；

# 2.  inverted_residual 模块中，使用了 se_layer 通道注意力机制, 用于筛选通道;
# 3.  将扩充的64通道对接到 ResNeXt 的主干网络中去；



from typing import Any, Callable, List, Optional, Sequence
from types import  FunctionType

import  torch
from torch import Tensor


# author: Chu Yun,  contact {chuyunxinlan at gmail dot com}
# time: 2023/3/21
#       下午6:18

# reference the  official pytorch code

from typing import Any, Callable, List, Optional, Sequence
from types import  FunctionType

import  torch
from torch import Tensor




def _log_api_usage_once(obj: Any) -> None:

    """
    Logs API usage(module and name) within an organization.
    In a large ecosystem, it's often useful to track the PyTorch and
    TorchVision APIs usage. This API provides the similar functionality to the
    logging module in the Python stdlib. It can be used for debugging purpose
    to log which methods are used and by default it is inactive, unless the user
    manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
    Please note it is triggered only once for the same API call within a process.
    It does not collect any data from open-source users since it is no-op by default.
    For more information, please refer to
    * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
    * Logging policy: https://github.com/pytorch/vision/issues/5052;

    Args:
        obj (class instance or method): an object to extract info from.
    """
    if not obj.__module__.startswith("torchvision"):
        return
    name = obj.__class__.__name__
    if isinstance(obj, FunctionType):
        name = obj.__name__
    torch._C._log_api_usage_once(f"{obj.__module__}.{name}")



# 用于确保通道数被调整为 divisor 的整数倍（取最接近的倍数）;
def _make_divisiable(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v



class ConvNormActivation(torch.nn.Sequential):
    """
    Sequential block made of a ``Conv2d`` followed by an optional
    normalization layer and an optional activation layer.

    Args:
        in_channels (int): Channels of the input.
        out_channels (int): Channels produced by the convolution.
        kernel_size (int, optional): Convolution kernel size. Default: 3
        stride (int, optional): Convolution stride. Default: 1
        padding (int, optional): Convolution padding. When ``None`` it is
            computed as ``(kernel_size - 1) // 2 * dilation`` (1 for a 3x3
            kernel, 0 for a 1x1 kernel at dilation 1).
        groups (int, optional): ``groups`` argument of the convolution. Default: 1
        norm_layer (Callable[..., torch.nn.Module], optional): Called with
            ``out_channels`` and appended after the convolution. ``None``
            omits it. Default: ``torch.nn.BatchNorm2d``
        activation_layer (Callable[..., torch.nn.Module], optional): Appended
            last. ``None`` omits it. Default: ``torch.nn.ReLU``
        dilation (int): Convolution dilation. Default: 1
        inplace (bool, optional): Passed to the activation as ``inplace=...``;
            ``None`` means the activation is built without arguments.
            Default: ``True``
        bias (bool, optional): Whether the convolution has a bias. When
            ``None`` a bias is used only if ``norm_layer`` is ``None``.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 3,
        stride: int = 1,
        padding: Optional[int] = None,
        groups: int = 1,
        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
        dilation: int = 1,
        inplace: Optional[bool] = True,
        bias: Optional[bool] = None,
    ) -> None:
        # The default padding is not zero: it is chosen so that a stride-1
        # convolution with an odd kernel keeps the spatial size.
        if padding is None:
            padding = (kernel_size - 1) // 2 * dilation
        use_bias = (norm_layer is None) if bias is None else bias

        conv = torch.nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation=dilation,
            groups=groups,
            bias=use_bias,
        )
        stages = [conv]

        if norm_layer is not None:
            stages.append(norm_layer(out_channels))

        if activation_layer is not None:
            act_kwargs = {"inplace": inplace} if inplace is not None else {}
            stages.append(activation_layer(**act_kwargs))

        super().__init__(*stages)
        _log_api_usage_once(self)
        self.out_channels = out_channels


# note 通道注意力模块,  和CBAM 模块的区别，他缺少了空间注意力模块；
class SqueezeExcitation(torch.nn.Module):
    """
    Squeeze-and-Excitation channel attention, https://arxiv.org/abs/1709.01507 (Fig. 1).

    The input is pooled to 1x1, passed through two 1x1 convolutions with
    ``activation`` in between, squashed by ``scale_activation`` and multiplied
    back onto the input.  ``activation`` and ``scale_activation`` play the
    roles of ``delta`` and ``sigma`` in eq. 3 of the paper.

    Args:
        input_channels (int): Channels of the input (and of the output).
        squeeze_channels (int): Channels between the two 1x1 convolutions.
        activation (Callable[..., torch.nn.Module], optional): ``delta``
            activation. Default: ``torch.nn.ReLU``
        scale_activation (Callable[..., torch.nn.Module]): ``sigma``
            activation. Default: ``torch.nn.Sigmoid``
    """

    def __init__(self, input_channels: int, squeeze_channels: int,
                 activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
                 scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
                 ) -> None:
        super().__init__()
        _log_api_usage_once(self)  # record that this class was used

        self.avgpool = torch.nn.AdaptiveAvgPool2d(1)

        # 1x1 convolutions applied to the pooled 1x1 map.
        self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1)
        self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1)

        self.activation = activation()
        self.scale_activation = scale_activation()

    def _scale(self, input: Tensor) -> Tensor:
        """Return the attention weights computed from ``input``."""
        pooled = self.avgpool(input)
        squeezed = self.activation(self.fc1(pooled))
        return self.scale_activation(self.fc2(squeezed))

    def forward(self, input: Tensor) -> Tensor:
        """Return ``input`` multiplied by its attention weights."""
        return self._scale(input) * input




class InvertedResidualConfig:
    """
    Parameter record for one ``InvertedResidual`` block (in the spirit of
    Tables 1 and 2 of the MobileNetV3 paper).

    All three channel counts are first multiplied by ``width_mult`` and then
    rounded by ``adjust_channels``.

    Args:
        input_channels: channels entering the block.
        kernel: kernel size of the depthwise convolution.
        expanded_channels: channels of the expanded (middle) representation.
        out_channels: channels leaving the block.
        use_se: whether the block inserts a squeeze-excitation layer.
        activation: ``"HS"`` sets ``use_hs`` (``InvertedResidual`` then picks
            ``nn.Hardswish``); any other value leaves it ``False``.
        stride: stride of the depthwise convolution.
        dilation: dilation of the depthwise convolution.
        width_mult: multiplier applied to the channel counts.
    """

    def __init__(self,
                 input_channels: int, kernel: int,
                 expanded_channels: int, out_channels: int,
                 use_se: bool, activation: str,
                 stride: int, dilation: int,
                 width_mult: float,
                 ):
        rescale = self.adjust_channels

        self.input_channels = rescale(input_channels, width_mult)
        self.kernel = kernel
        self.expanded_channels = rescale(expanded_channels, width_mult)
        self.out_channels = rescale(out_channels, width_mult)

        self.use_se = use_se               # enable the SE channel-attention layer
        self.use_hs = activation == "HS"   # True selects the hard-swish activation
        self.stride = stride
        self.dilation = dilation           # values > 1 enable dilated convolution

    @staticmethod
    def adjust_channels(channels: int, width_mult: float):
        """Scale ``channels`` by ``width_mult`` and round via ``_make_divisiable``."""
        # The divisor 9 fixes the multiple that channel counts are rounded to.
        scaled = channels * width_mult
        return _make_divisiable(scaled, 9)



from  torch import  nn
from  functools import partial

class InvertedResidual(nn.Module):
    """
    Inverted-residual block following section 5 of the MobileNetV3 paper.

    The block stacks, in order: an optional 1x1 expansion (only when the
    expanded width differs from the input width), a depthwise convolution, an
    optional squeeze-excitation layer and a 1x1 projection without activation.

    Note that the 1x1 expansion here is built with
    ``groups=cnf.input_channels``, i.e. it is a grouped convolution.

    Args:
        cnf: configuration of this block.
        norm_layer: factory for the normalization layers.
        se_layer: factory called as ``se_layer(expanded_channels,
            squeeze_channels)`` when ``cnf.use_se`` is set.

    Raises:
        ValueError: if ``cnf.stride`` is not within ``[1, 2]``.
    """

    def __init__(self,
                 cnf: InvertedResidualConfig,
                 norm_layer: Callable[..., nn.Module],
                 se_layer: Callable[..., nn.Module] = partial(SqueezeExcitation, scale_activation = nn.Hardsigmoid)
                 ):
        super().__init__()
        if not (1 <= cnf.stride <= 2):  # only strides 1 and 2 are accepted
            raise ValueError(" illegal stride value")

        # The skip connection needs an unchanged resolution (stride 1) and
        # matching input/output channel counts.
        self.use_res_connet = cnf.stride == 1 and cnf.input_channels == cnf.out_channels

        act_cls = nn.Hardswish if cnf.use_hs else nn.ReLU
        stages: List[nn.Module] = []

        # 1x1 expansion, skipped when the width does not change.
        if cnf.expanded_channels != cnf.input_channels:
            expand = ConvNormActivation(
                cnf.input_channels,
                cnf.expanded_channels,
                kernel_size=1,
                groups=cnf.input_channels,
                norm_layer=norm_layer,
                activation_layer=act_cls,
            )
            stages.append(expand)

        # Depthwise convolution: one group per channel. A dilated block always
        # runs at stride 1; otherwise the configured stride is used.
        dw_stride = 1 if cnf.dilation > 1 else cnf.stride
        depthwise = ConvNormActivation(
            cnf.expanded_channels,
            cnf.expanded_channels,
            kernel_size=cnf.kernel,
            stride=dw_stride,
            dilation=cnf.dilation,
            groups=cnf.expanded_channels,
            norm_layer=norm_layer,
            activation_layer=act_cls,
        )
        stages.append(depthwise)

        # Optional squeeze-excitation; the squeeze width is half the expanded
        # width, rounded with divisor 9.
        if cnf.use_se:
            squeeze_channels = _make_divisiable(cnf.expanded_channels // 2, 9)
            stages.append(se_layer(cnf.expanded_channels, squeeze_channels))

        # 1x1 projection down to the output width, no activation.
        project = ConvNormActivation(
            cnf.expanded_channels, cnf.out_channels, kernel_size=1,
            norm_layer=norm_layer, activation_layer=None,
        )
        stages.append(project)

        self.block = nn.Sequential(*stages)

        # Informational attributes; not read anywhere inside this class.
        self.out_channels = cnf.out_channels
        self._is_cn = cnf.stride > 1

    def forward(self, input: Tensor) -> Tensor:
        """Run the block and add the input back when the skip connection is enabled."""
        out = self.block(input)
        if not self.use_res_connet:
            return out
        out += input
        return out




# note : 使用深度可分离卷积　和通道注意力将输入通道数扩充
class InputSteam(nn.Module):
    """
    Input stem: a grouped 3x3 convolution on a 9-channel input followed by a
    chain of inverted-residual blocks.
    """

    def __init__(self,
                 inverted_residual_setting: List[InvertedResidualConfig],
                 block: Optional[Callable[..., nn.Module]] = None,
                 norm_layer: Optional[Callable[..., nn.Module]] = None,
                 dropout: float = 0.2,
                 **kwargs: Any,
                 ) -> None:
        """
        Args:
            inverted_residual_setting: one configuration per block, in the
                order the blocks are stacked.
            block: block class called as ``block(cnf, norm_layer)``; defaults
                to ``InvertedResidual``.
            norm_layer: normalization factory; defaults to ``BatchNorm2d``
                with ``eps=0.001`` and ``momentum=0.01``.
            dropout: accepted but not used by this class.
            **kwargs: accepted but not used by this class.

        Raises:
            ValueError: if ``inverted_residual_setting`` is empty.
            TypeError: if it is not a sequence of ``InvertedResidualConfig``.
        """
        super().__init__()

        _log_api_usage_once(self)

        # Validate the configuration list before building anything.
        if not inverted_residual_setting:
            raise ValueError( " The inverted residual setting should not be  empty")
        is_config_sequence = isinstance(inverted_residual_setting, Sequence) and all(
            isinstance(cfg, InvertedResidualConfig) for cfg in inverted_residual_setting
        )
        if not is_config_sequence:
            raise TypeError ("The inverted residual setting should be List[InvertedResidualConfig]")

        block = InvertedResidual if block is None else block
        if norm_layer is None:
            norm_layer = partial(nn.BatchNorm2d, eps= 0.001, momentum=0.01)

        # First layer: grouped 3x3 conv (groups=9) from 9 input channels to
        # the width expected by the first inverted-residual block.
        first_width = inverted_residual_setting[0].input_channels
        first_conv = ConvNormActivation(
            9, first_width,
            kernel_size=3, stride=1,
            groups=9,
            norm_layer=norm_layer, activation_layer=nn.Hardswish,
        )

        # Body: one block per configuration entry.
        stages: List[nn.Module] = [first_conv]
        for cfg in inverted_residual_setting:
            stages.append(block(cfg, norm_layer))

        self.features = nn.Sequential(*stages)

        # Weight initialisation.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out')
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        """Apply the stem to ``x``."""
        return self.features(x)

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)


# author: Chu Yun,  contact {chuyunxinlan at gmail dot com}
# time: 2023/3/24
#       下午6:36

# reference the official pytorch1.7.1 code

import  torch.nn  as  nn
import torch.nn.functional as F



def conv3x3(in_plances, out_plances, stride=1, groups=1, dilation=1 ):
    """Build a bias-free 3x3 ``Conv2d`` whose padding equals ``dilation``.

    ``groups=1`` (the default) gives an ordinary, non-grouped convolution.
    """
    conv_options = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
        bias=False,
        dilation=dilation,
    )
    return nn.Conv2d(in_plances, out_plances, **conv_options)


def conv1x1(in_plances, out_plances, stride =1):
    """Build a bias-free 1x1 ``Conv2d`` (padding stays at the default 0)."""
    pointwise = nn.Conv2d(
        in_plances,
        out_plances,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return pointwise



class Bottleneck(nn.Module):
    """
    Bottleneck residual block: 1x1 conv -> 3x3 conv -> 1x1 conv, each followed
    by a normalization layer, plus a shortcut connection.

    As in torchvision, the stride is applied by the 3x3 convolution
    (``self.conv2``), whereas the original implementation from "Deep Residual
    Learning for Image Recognition" (https://arxiv.org/abs/1512.03385) puts it
    on the first 1x1 convolution.  This variant is also known as ResNet V1.5.

    The block outputs ``planes * expansion`` channels.  The inner width is
    ``int(planes * (base_width / 64.)) * groups``, and ``groups`` is passed on
    to the 3x3 convolution.
    """

    expansion = 2

    def __init__(self, inplanes, planes, stride=1, downsanple=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None, ):
        super(Bottleneck, self).__init__()

        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer

        # Channel count used by conv1's output and by the 3x3 convolution.
        width = int(planes * (base_width / 64.)) * groups
        out_planes = planes * self.expansion

        # When stride != 1, both self.conv2 and self.downsample reduce the
        # spatial size of their input.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = make_norm(width)

        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = make_norm(width)

        # Final 1x1 conv maps ``width`` to ``planes * expansion`` channels.
        self.conv3 = conv1x1(width, out_planes)
        self.bn3 = make_norm(out_planes)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsanple
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # The shortcut is the input itself unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

import  torch

# note : construct  the network
class DepthSepResXt(nn.Module):
    """
    ResNeXt-style classifier placed behind an inverted-residual input stem.

    Data flow: ``input_steam`` (three SE-enabled inverted-residual blocks
    ending in 72 channels) -> ``layer1`` ... ``layer10`` built from ``block``
    -> flatten -> ``fc`` (two Linear/ReLU layers with dropout) -> ``cls_fc``.

    ``fc`` starts with ``nn.Linear(2048, 128)``, so the flattened output of
    ``layer10`` must contain exactly 2048 values per sample.

    Args:
        block: residual block class. It must expose an ``expansion`` class
            attribute and accept the arguments passed in ``_make_layer``.
        layers: number of blocks in each of the ten stages (indices 0..9).
        num_classes: output size of ``cls_fc``.
        zero_init_residual: if true, the ``bn3`` weight of every
            ``Bottleneck`` is set to zero so that each residual branch starts
            out contributing nothing.
        groups: ``groups`` value handed to every block.
        width_per_group: ``base_width`` value handed to every block.
        drop_out: dropout probability used inside ``fc``.
        replace_stride_with_dilation: three booleans; a true entry replaces
            the stride-2 step of a stage by dilation. Entry 0 applies to
            ``layer2``, entry 1 to ``layer3`` and entry 2 to ``layer4``
            through ``layer10``. ``None`` means all false.
        norm_layer: normalization factory for the residual stages; defaults
            to ``nn.BatchNorm2d``.

    Raises:
        ValueError: if ``replace_stride_with_dilation`` does not have exactly
            three elements.
    """

    def __init__(self, block, layers, num_classes = 4,  zero_init_residual=False,
                 groups=1, width_per_group=64, drop_out=0.1, replace_stride_with_dilation=None,
                 norm_layer=None):

        super(DepthSepResXt, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self._norm_layer = norm_layer
        self.dilation = 1

        if replace_stride_with_dilation is None:
            # Each entry says whether a stride-2 step is replaced by dilation.
            replace_stride_with_dilation = [False, False, False]

        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))

        self.groups = groups
        self.base_width = width_per_group  # per-group width handed to the blocks

        width_mult = 1
        bneck_conf = partial(InvertedResidualConfig, width_mult=width_mult)

        # Positional arguments of each entry:
        #   input channels, kernel size, expanded channels, output channels,
        #   use SE, activation ("RE" = ReLU), stride, dilation.
        # ``width_mult`` is pre-bound through ``partial`` above.
        #
        # The stem blocks use grouped convolutions, so the channel counts of
        # consecutive entries are kept as integer multiples of each other
        # (Conv2d requires in/out channels to be divisible by ``groups``).
        # A block only gets a residual connection when its input and output
        # channel counts are equal.
        self.inverted_residual_setting = [
            bneck_conf(18, 3, 18, 18, True, "RE", 1, 1),  # C1
            bneck_conf(18, 3, 36, 36, True, "RE", 1, 1),  # C2
            bneck_conf(36, 3, 72, 72, True, "RE", 1, 1),
        ]

        self.input_steam = InputSteam(self.inverted_residual_setting)

        # The first residual stage consumes the stem output, so its input
        # width is taken from the last stem configuration (72 here) instead
        # of being repeated as a literal.
        self.inplanes = self.inverted_residual_setting[-1].out_channels

        self.layer1 = self._make_layer(block, 72, layers[0])
        self.layer2 = self._make_layer(block, 118, layers[1],
                                       stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 128, layers[2],
                                       stride=2, dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 156, layers[3],
                                       stride=2, dilate=replace_stride_with_dilation[2])

        self.layer5 = self._make_layer(block, 186, layers[4],
                                       stride=2, dilate=replace_stride_with_dilation[2])
        self.layer6 = self._make_layer(block, 206, layers[5],
                                       stride=2, dilate=replace_stride_with_dilation[2])

        self.layer7 = self._make_layer(block, 226, layers[6],
                                       stride=2, dilate=replace_stride_with_dilation[2])
        self.layer8 = self._make_layer(block, 236, layers[7],
                                       stride=2, dilate=replace_stride_with_dilation[2])

        self.layer9 = self._make_layer(block, 246, layers[8],
                                       stride=2, dilate=replace_stride_with_dilation[2])

        self.layer10 = self._make_layer(block, 256, layers[9],
                                        stride=2, dilate=replace_stride_with_dilation[2])

        # Classification head. The 2048 input features are fixed, which ties
        # the model to inputs whose flattened layer10 output has that size.
        self.fc = nn.Sequential(
            nn.Dropout(drop_out), nn.Linear(2048, 128), nn.ReLU(True),
            nn.Dropout(drop_out), nn.Linear(128, 128), nn.ReLU(True))

        self.cls_fc = nn.Linear(128, num_classes)

        # This loop visits every submodule, so it also re-initialises the
        # convolutions and norm layers of the input stem.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """
        Build one stage made of ``blocks`` instances of ``block``.

        Args:
            block: block class to instantiate.
            planes: base width of the stage; it outputs
                ``planes * block.expansion`` channels.
            blocks: how many blocks to stack.
            stride: stride of the first block of the stage.
            dilate: if true, ``self.dilation`` is multiplied by ``stride`` and
                the stage runs at stride 1 instead.

        Returns:
            ``nn.Sequential`` holding the stage. ``self.inplanes`` (and
            ``self.dilation`` when ``dilate`` is set) are updated as a side
            effect.
        """
        norm_layer = self._norm_layer
        downsample = None

        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1

        out_planes = planes * block.expansion

        # The shortcut needs a 1x1 conv + norm whenever the first block
        # changes the resolution or the channel count.
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, out_planes, stride),
                norm_layer(out_planes),
            )

        # First block: takes the incoming width, applies stride/downsample and
        # uses the dilation that was in effect before this stage.
        layers = [block(self.inplanes, planes, stride, downsample, self.groups,
                        self.base_width, previous_dilation, norm_layer)]

        # Every following block receives the expanded width.
        self.inplanes = out_planes

        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        """Run stem, the ten residual stages, flatten, and the two heads."""
        x = self.input_steam(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)
        x = self.layer8(x)

        x = self.layer9(x)
        x = self.layer10(x)

        # No pooling here: the feature map is flattened as-is.
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = self.cls_fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)



def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """
    Instantiate ``DepthSepResXt`` for the given block type and stage depths.

    Args:
        arch: architecture name; only used in the warning below.
        block: residual block class forwarded to ``DepthSepResXt``.
        layers: per-stage block counts forwarded to ``DepthSepResXt``.
        pretrained: no weights are loaded by this function. A true value only
            triggers a ``UserWarning`` so the caller is not misled into
            thinking the returned model is pretrained.
        progress: unused; kept so the signature matches the torchvision
            helper this function is modelled on.
        **kwargs: forwarded to ``DepthSepResXt``.

    Returns:
        The freshly constructed model.
    """
    model = DepthSepResXt(block, layers, **kwargs)

    if pretrained:
        # Previously this branch was a silent ``pass``.
        import warnings

        warnings.warn(
            "pretrained=True was requested for '{}', but no pretrained weights "
            "are available; returning a randomly initialised model.".format(arch),
            UserWarning,
            stacklevel=2,
        )

    return model



def inverted_resxt_32x4d(pretrained=False, progress=True, **kwargs):
    """
    Build the 32x4d variant: 32 groups with a per-group width of 4.

    ``groups`` and ``width_per_group`` in ``kwargs`` are overwritten; all
    other keyword arguments are forwarded unchanged.
    """
    # groups = number of parallel branches, width_per_group = width of each.
    kwargs.update(groups=32, width_per_group=4)

    # One entry per stage; each value is the number of Bottleneck blocks
    # stacked in that stage.
    blocks_per_stage = [2,2,2,2, 1,1,1,1, 1,1]

    return _resnet('depth_separa_resxt_32x4d', Bottleneck, blocks_per_stage,
                   pretrained, progress, **kwargs)



if __name__ ==  "__main__":
    # Smoke test: push one random 9-channel 576x600 image through the model
    # and print the shape of the result.

    model_resxt = inverted_resxt_32x4d(num_classes=4,)
    image = torch.rand(1, 9, 576, 600)

    out = model_resxt(image)

    # The original line was cut off mid-expression (``print(out.shap``).
    print(out.shape)

2.2 　重新设计轻量化的结构

由于上面的网络在每一层中都使用了 groups=32 的多分支结构，这里重新设计一个轻量化的结构：

轻量化的结构：
1. 前期使用 2 层 inverted_residual + 通道注意力，实现通道扩充；
2. 中间使用 4 层 ResNeXt，每层中使用 2 个带有分支功能的 Bottleneck，完成多分支并行；
3. 后期使用普通的 Bottleneck，完成卷积运算。