代码注释如下(注意:ConvNext和CSPdarknet网络可以根据前面几个章节的内容自己写):
import torch
import torch.nn as nn

from ConvNext import ConvNeXt_Small, ConvNeXt_Tiny
from CSPdarknet import C3, Conv, CSPDarknet


# ---------------------------------------------------#
#   yolo_body
# ---------------------------------------------------#
class YoloBody(nn.Module):
    """YOLOv5-style detector: backbone, two-way feature-fusion neck, three heads.

    The backbone yields three feature maps (``feat1``, ``feat2``, ``feat3``).
    The neck first fuses them top-down (upsample + concat), then bottom-up
    (stride-2 conv + concat), and a 1x1 convolution per scale produces the
    raw predictions.

    With ``C = base_channels = int(width_multiplier * 64)`` the neck works on
    ``4C``, ``8C`` and ``16C`` channels.  The shape notes in the original code
    (256 / 512 / 1024 channels on 80x80 / 40x40 / 20x20 grids for a 640x640
    input) correspond to ``C = 64``, i.e. ``phi='l'``.

    Args:
        anchors_mask: Three groups of anchor indices.  Only the length of
            each group is used here (anchors per head); group ``[2]`` belongs
            to the P3 head, ``[1]`` to P4 and ``[0]`` to P5.
        num_classes: Number of object classes.  Every head outputs
            ``len(group) * (5 + num_classes)`` channels (4 box values,
            1 objectness score and the class scores per anchor).
        phi: Model scale, one of ``'s'``, ``'m'``, ``'l'``, ``'x'``.
        backbone: ``'cspdarknet'``, ``'convnext_tiny'`` or
            ``'convnext_small'``.
        pretrained: Forwarded to the backbone constructor.
        input_shape: Forwarded to the ConvNeXt constructors only; unused
            for ``'cspdarknet'``.

    Raises:
        KeyError: If ``phi`` or ``backbone`` is not one of the values above.
    """

    # NOTE: the list defaults are kept as-is for interface compatibility.
    # This class only reads ``anchors_mask`` and forwards ``input_shape``.
    def __init__(self, anchors_mask=[[6, 7, 8], [3, 4, 5], [0, 1, 2]],
                 num_classes=10, phi="s", backbone='cspdarknet',
                 pretrained=False, input_shape=[640, 640]):
        super().__init__()

        depth_dict = {'s': 0.33, 'm': 0.67, 'l': 1.00, 'x': 1.33}
        width_dict = {'s': 0.50, 'm': 0.75, 'l': 1.00, 'x': 1.25}
        dep_mul, wid_mul = depth_dict[phi], width_dict[phi]

        base_channels = int(wid_mul * 64)            # 32 / 48 / 64 / 80 for s / m / l / x
        base_depth = max(round(dep_mul * 3), 1)      # 1 / 2 / 3 / 4 for s / m / l / x

        # Remembered so forward() knows whether the 1x1 adapters exist.
        self.backbone_name = backbone
        if backbone == "cspdarknet":
            # ---------------------------------------------------#
            #   CSPDarknet backbone. It returns three feature maps
            #   which the neck consumes as 4C, 8C and 16C channels.
            # ---------------------------------------------------#
            self.backbone = CSPDarknet(base_channels, base_depth, phi, pretrained)
        else:
            # ---------------------------------------------------#
            #   Any other backbone: 1x1 convolutions adapt its
            #   channel counts to the 4C / 8C / 16C the neck expects.
            #   One table keeps each constructor next to its channel
            #   list; an unknown name raises KeyError.
            #   (A Swin Transformer tiny entry was sketched in the
            #   original but is not wired up.)
            # ---------------------------------------------------#
            convnext_variants = {
                'convnext_tiny': (ConvNeXt_Tiny, [192, 384, 768]),
                'convnext_small': (ConvNeXt_Small, [192, 384, 768]),
            }
            build_backbone, in_channels = convnext_variants[backbone]
            self.backbone = build_backbone(pretrained=pretrained, input_shape=input_shape)

            feat1_c, feat2_c, feat3_c = in_channels
            self.conv_1x1_feat1 = Conv(feat1_c, base_channels * 4, 1, 1)
            self.conv_1x1_feat2 = Conv(feat2_c, base_channels * 8, 1, 1)
            self.conv_1x1_feat3 = Conv(feat3_c, base_channels * 16, 1, 1)

        # Nearest-neighbour upsampling: doubles height and width, keeps channels.
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

        # Top-down path, step 1: reduce feat3 from 16C to 8C before upsampling;
        # the C3 block then maps the 16C concat (8C + 8C) back to 8C.
        self.conv_for_feat3 = Conv(base_channels * 16, base_channels * 8, 1, 1)
        self.conv3_for_upsample1 = C3(base_channels * 16, base_channels * 8, base_depth, shortcut=False)

        # Top-down path, step 2: reduce 8C to 4C before upsampling;
        # the C3 block maps the 8C concat (4C + 4C) back to 4C.
        self.conv_for_feat2 = Conv(base_channels * 8, base_channels * 4, 1, 1)
        self.conv3_for_upsample2 = C3(base_channels * 8, base_channels * 4, base_depth, shortcut=False)

        # Bottom-up path, step 1: 3x3 stride-2 conv halves the resolution (4C -> 4C);
        # the C3 block processes the 8C concat and keeps 8C.
        self.down_sample1 = Conv(base_channels * 4, base_channels * 4, 3, 2)
        self.conv3_for_downsample1 = C3(base_channels * 8, base_channels * 8, base_depth, shortcut=False)

        # Bottom-up path, step 2: 3x3 stride-2 conv halves the resolution (8C -> 8C);
        # the C3 block processes the 16C concat and keeps 16C.
        self.down_sample2 = Conv(base_channels * 8, base_channels * 8, 3, 2)
        self.conv3_for_downsample2 = C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False)

        # Detection heads: one 1x1 conv per scale, each emitting
        # anchors * (4 box + 1 objectness + num_classes) channels.
        # P3: highest-resolution map, 4C channels in.
        self.yolo_head_P3 = nn.Conv2d(base_channels * 4, len(anchors_mask[2]) * (5 + num_classes), 1)
        # P4: middle map, 8C channels in.
        self.yolo_head_P4 = nn.Conv2d(base_channels * 8, len(anchors_mask[1]) * (5 + num_classes), 1)
        # P5: lowest-resolution map, 16C channels in.
        self.yolo_head_P5 = nn.Conv2d(base_channels * 16, len(anchors_mask[0]) * (5 + num_classes), 1)

    def forward(self, x):
        """Run backbone, neck and heads on a batch of images.

        Args:
            x: Image batch; the demo at the bottom of this file feeds a
                ``(4, 3, 640, 640)`` tensor.

        Returns:
            Tuple ``(out0, out1, out2)`` from the P5, P4 and P3 heads, in that
            order (lowest-resolution map first).  Each has
            ``anchors * (5 + num_classes)`` channels.
        """
        # Backbone features, ordered from highest to lowest resolution.
        feat1, feat2, feat3 = self.backbone(x)
        if self.backbone_name != "cspdarknet":
            # Adapt foreign backbone channels to 4C / 8C / 16C.
            feat1 = self.conv_1x1_feat1(feat1)
            feat2 = self.conv_1x1_feat2(feat2)
            feat3 = self.conv_1x1_feat3(feat3)

        # ---------------- top-down path ---------------- #
        # 16C -> 8C with a 1x1 conv, then upsample x2 (channels unchanged).
        p5 = self.conv_for_feat3(feat3)
        p5_upsample = self.upsample(p5)
        # Concatenate along channels: 8C + 8C = 16C, then C3 reduces to 8C.
        p4 = torch.cat([p5_upsample, feat2], dim=1)
        p4 = self.conv3_for_upsample1(p4)

        # 8C -> 4C with a 1x1 conv, then upsample x2.
        p4 = self.conv_for_feat2(p4)
        p4_upsample = self.upsample(p4)
        # Concatenate along channels: 4C + 4C = 8C, then C3 reduces to 4C.
        p3 = torch.cat([p4_upsample, feat1], dim=1)
        p3 = self.conv3_for_upsample2(p3)

        # ---------------- bottom-up path ---------------- #
        # Stride-2 conv halves P3's resolution; concat with the 4C P4 gives 8C.
        p3_downsample = self.down_sample1(p3)
        p4 = torch.cat([p3_downsample, p4], dim=1)
        p4 = self.conv3_for_downsample1(p4)

        # Stride-2 conv halves P4's resolution; concat with the 8C P5 gives 16C.
        p4_downsample = self.down_sample2(p4)
        p5 = torch.cat([p4_downsample, p5], dim=1)
        p5 = self.conv3_for_downsample2(p5)

        # ---------------------------------------------------#
        #   Heads. Output channels are anchors * (5 + num_classes),
        #   e.g. 3 * (5 + 10) = 45 with the default arguments.
        # ---------------------------------------------------#
        out2 = self.yolo_head_P3(p3)   # highest-resolution predictions
        out1 = self.yolo_head_P4(p4)   # middle-resolution predictions
        out0 = self.yolo_head_P5(p5)   # lowest-resolution predictions
        return out0, out1, out2


if __name__ == "__main__":
    # Smoke test: a random batch of 4 RGB images of size 640x640,
    # i.e. a tensor of shape (batch, channels, height, width) = (4, 3, 640, 640).
    dummy_images = torch.randn(4, 3, 640, 640)

    # Default model: phi='s', CSPDarknet backbone, 10 classes.
    model = YoloBody()

    # Only the output shapes are inspected, so skip building the autograd graph.
    with torch.no_grad():
        out0, out1, out2 = model(dummy_images)

    # Each output has 3 * (5 + 10) = 45 channels; per the shape notes above the
    # grids should be 20x20, 40x40 and 80x80 for a 640x640 input.
    print(out0.shape, out1.shape, out2.shape)