FasterNet
论文的核心贡献是提出了PConv,这是一种新的卷积操作,它通过减少部分输入通道的计算和内存访问来提高效率。此外,FasterNet是基于PConv构建的一系列网络,它们在保持准确性的同时,显著提高了运行速度。
论文标题为《Run, Don’t Walk: Chasing Higher FLOPS for Faster Neural Networks》
作者是Jierun Chen, Shiu-hong Kao, Hao He, Weipeng Zhuo, Song Wen, Chul-Ho Lee, S.-H. Gary Chan
作者分别来自香港科技大学(HKUST)、罗格斯大学(Rutgers University)和德克萨斯州立大学(Texas State University)。
摘要:
这篇论文探讨了如何设计快速神经网络的问题。许多现有工作集中在减少浮点运算次数(FLOPs)以提高网络速度,但作者发现FLOPs的减少并不一定带来延迟的同等程度降低。这主要是因为每秒浮点运算次数(FLOPS)效率低下。为了实现更快的网络,作者重新审视了流行的操作符,并展示了这种低FLOPS主要是由于操作符(特别是深度卷积)频繁的内存访问造成的。因此,作者提出了一种新颖的部分卷积(PConv),它通过减少冗余计算和内存访问来更有效地提取空间特征。基于PConv,作者进一步提出了FasterNet,这是一系列新的神经网络,它在各种设备上的速度明显快于其他网络,而不会牺牲各种视觉任务的准确性。例如,在ImageNet-1k上,FasterNet-T0在GPU、CPU和ARM处理器上分别比MobileViT-XXS快2.8倍、3.3倍和2.4倍,同时准确率提高了2.9%。FasterNet-L在GPU上的推理吞吐量比Swin-B高36%,在CPU上节省了37%的计算时间,同时达到了83.5%的top-1准确率。
引言:
神经网络在各种计算机视觉任务中迅速发展,如图像分类、检测和分割。为了追求快速神经网络,研究人员和实践者更倾向于设计计算复杂度低、延迟低、吞吐量高的神经网络。作者指出,为了实现快速网络,仅仅减少FLOPs是不够的,还需要提高FLOPS。
相关工作:
作者简要回顾了之前关于快速和高效神经网络的研究,并区分了这项工作与之前工作的不同之处。
PConv和FasterNet的设计:
作者首先重新审视了深度卷积(DWConv)并分析了其频繁内存访问的问题。然后,作者介绍了PConv作为解决该问题的有竞争力的替代操作符。接着,作者介绍了基于PConv构建的FasterNet,这是一个新的神经网络家族,它在各种设备上运行得非常快,并且对许多视觉任务非常有效。
实验结果:
论文中,作者首先检验了PConv的计算速度以及与PWConv结合使用时的有效性。然后,作者全面评估了FasterNet在分类、检测和分割任务上的性能。最后,作者进行了简要的消融研究。
读了论文后,自行搭建yolo11代码,并在\ultralytics-yolo11\ultralytics\nn\Extramodule路径增加FasterNet实现代码如下:
import torch
import torch.nn as nn
from timm.models.layers import DropPath, trunc_normal_
from functools import partial
from typing import List
from torch import Tensor
import copy
import os
class Partial_conv3(nn.Module):
    """Partial convolution (PConv) from FasterNet.

    A 3x3 convolution is applied only to the first ``dim // n_div``
    channels; the remaining channels pass through untouched, which cuts
    both FLOPs and memory traffic.

    Args:
        dim: total number of input (and output) channels.
        n_div: divisor selecting the fraction of channels to convolve.
        forward: forward strategy — ``'slicing'`` (inference only, works
            on a clone in place) or ``'split_cat'`` (training and
            inference, split + concat).
    """

    def __init__(self, dim, n_div, forward):
        super().__init__()
        self.dim_conv3 = dim // n_div
        self.dim_untouched = dim - self.dim_conv3
        self.partial_conv3 = nn.Conv2d(self.dim_conv3, self.dim_conv3, 3, 1, 1, bias=False)

        # Bind the chosen strategy as this instance's forward method.
        if forward == 'slicing':
            self.forward = self.forward_slicing
        elif forward == 'split_cat':
            self.forward = self.forward_split_cat
        else:
            raise NotImplementedError

    def forward_slicing(self, x: Tensor) -> Tensor:
        # only for inference
        out = x.clone()  # keep the original input intact for the residual connection later
        out[:, :self.dim_conv3, :, :] = self.partial_conv3(out[:, :self.dim_conv3, :, :])
        return out

    def forward_split_cat(self, x: Tensor) -> Tensor:
        # for training/inference
        convolved, untouched = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1)
        return torch.cat((self.partial_conv3(convolved), untouched), dim=1)
class MLPBlock(nn.Module):
    """FasterNet block: PConv spatial mixing followed by a 1x1-conv MLP,
    wrapped in a residual connection with optional DropPath and LayerScale.

    Args:
        dim: number of channels.
        n_div: channel divisor forwarded to Partial_conv3.
        mlp_ratio: hidden-channel expansion ratio of the MLP.
        drop_path: stochastic-depth rate; 0 disables DropPath.
        layer_scale_init_value: initial LayerScale value; <= 0 disables it.
        act_layer: activation class (e.g. ``nn.ReLU``).
        norm_layer: normalization class (e.g. ``nn.BatchNorm2d``).
        pconv_fw_type: ``'slicing'`` or ``'split_cat'``, forwarded to
            Partial_conv3.
    """

    def __init__(self,
                 dim,
                 n_div,
                 mlp_ratio,
                 drop_path,
                 layer_scale_init_value,
                 act_layer,
                 norm_layer,
                 pconv_fw_type
                 ):
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.n_div = n_div

        # Channel MLP: conv1x1 -> norm -> act -> conv1x1.
        mlp_hidden_dim = int(dim * mlp_ratio)
        mlp_layer: List[nn.Module] = [
            nn.Conv2d(dim, mlp_hidden_dim, 1, bias=False),
            norm_layer(mlp_hidden_dim),
            act_layer(),
            nn.Conv2d(mlp_hidden_dim, dim, 1, bias=False)
        ]
        self.mlp = nn.Sequential(*mlp_layer)

        self.spatial_mixing = Partial_conv3(
            dim,
            n_div,
            pconv_fw_type
        )

        if layer_scale_init_value > 0:
            # Per-channel learnable scale on the residual branch (LayerScale).
            # Fixed: `torch.ones((dim,))` — the original `(dim)` was a plain
            # int in parentheses, not a tuple (same result, misleading form).
            self.layer_scale = nn.Parameter(
                layer_scale_init_value * torch.ones((dim,)), requires_grad=True)
            self.forward = self.forward_layer_scale
        # else: keep the plain `forward` defined below.  The original
        # `self.forward = self.forward` else-branch was a no-op and removed.

    def forward(self, x: Tensor) -> Tensor:
        shortcut = x
        x = self.spatial_mixing(x)
        x = shortcut + self.drop_path(self.mlp(x))
        return x

    def forward_layer_scale(self, x: Tensor) -> Tensor:
        shortcut = x
        x = self.spatial_mixing(x)
        x = shortcut + self.drop_path(
            self.layer_scale.unsqueeze(-1).unsqueeze(-1) * self.mlp(x))
        return x
class BasicStage(nn.Module):
    """A stack of ``depth`` MLPBlocks operating at a fixed channel width.

    ``drop_path`` is indexable: entry ``i`` is the stochastic-depth rate
    for block ``i`` of this stage.
    """

    def __init__(self,
                 dim,
                 depth,
                 n_div,
                 mlp_ratio,
                 drop_path,
                 layer_scale_init_value,
                 norm_layer,
                 act_layer,
                 pconv_fw_type
                 ):
        super().__init__()
        # One MLPBlock per depth step, each with its own drop-path rate.
        self.blocks = nn.Sequential(*(
            MLPBlock(
                dim=dim,
                n_div=n_div,
                mlp_ratio=mlp_ratio,
                drop_path=drop_path[i],
                layer_scale_init_value=layer_scale_init_value,
                norm_layer=norm_layer,
                act_layer=act_layer,
                pconv_fw_type=pconv_fw_type,
            )
            for i in range(depth)
        ))

    def forward(self, x: Tensor) -> Tensor:
        return self.blocks(x)
class PatchEmbed(nn.Module):
    """Stem: embeds the image via a strided conv, then normalizes.

    With the default patch_size == patch_stride the patches are
    non-overlapping. ``norm_layer`` of ``None`` means no normalization.
    """

    def __init__(self, patch_size, patch_stride, in_chans, embed_dim, norm_layer):
        super().__init__()
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_stride, bias=False)
        # Fall back to an identity when no normalization layer is requested.
        self.norm = norm_layer(embed_dim) if norm_layer is not None else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        return self.norm(self.proj(x))
class PatchMerging(nn.Module):
    """Downsampling between stages: a strided conv that doubles channels.

    ``norm_layer`` of ``None`` means no normalization after the reduction.
    """

    def __init__(self, patch_size2, patch_stride2, dim, norm_layer):
        super().__init__()
        self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=patch_size2, stride=patch_stride2, bias=False)
        self.norm = norm_layer(2 * dim) if norm_layer is not None else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        return self.norm(self.reduction(x))
class FasterNet(nn.Module):
    """FasterNet backbone ("Run, Don't Walk: Chasing Higher FLOPS for
    Faster Neural Networks").

    Builds a PatchEmbed stem followed by ``len(depths)`` BasicStages with
    a PatchMerging downsampler between consecutive stages; stage ``i``
    runs at ``embed_dim * 2**i`` channels.  As configured here (for use
    as a YOLO detection backbone) ``forward`` is bound to ``forward_det``
    and returns the feature maps of all four stages.
    """

    def __init__(self,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=96,
                 depths=(1, 2, 8, 2),
                 mlp_ratio=2.,
                 n_div=4,
                 patch_size=4,
                 patch_stride=4,
                 patch_size2=2,  # for subsequent layers
                 patch_stride2=2,
                 patch_norm=True,
                 feature_dim=1280,
                 drop_path_rate=0.1,
                 layer_scale_init_value=0,
                 norm_layer='BN',
                 act_layer='RELU',
                 fork_feat=True,
                 init_cfg=None,
                 pretrained=None,
                 pconv_fw_type='split_cat',
                 **kwargs):
        super().__init__()
        # Map string identifiers to layer classes; only 'BN' and
        # 'GELU'/'RELU' are supported here.
        if norm_layer == 'BN':
            norm_layer = nn.BatchNorm2d
        else:
            raise NotImplementedError

        if act_layer == 'GELU':
            act_layer = nn.GELU
        elif act_layer == 'RELU':
            act_layer = partial(nn.ReLU, inplace=True)
        else:
            raise NotImplementedError

        # num_classes is only recorded for the classification variant.
        if not fork_feat:
            self.num_classes = num_classes
        self.num_stages = len(depths)
        self.embed_dim = embed_dim
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2 ** (self.num_stages - 1))
        self.mlp_ratio = mlp_ratio
        self.depths = depths

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            patch_stride=patch_stride,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None
        )

        # stochastic depth decay rule: per-block drop-path rates increase
        # linearly from 0 to drop_path_rate over all blocks
        dpr = [x.item()
               for x in torch.linspace(0, drop_path_rate, sum(depths))]

        # build layers: BasicStage at even indices of self.stages,
        # PatchMerging between stages at odd indices
        stages_list = []
        for i_stage in range(self.num_stages):
            stage = BasicStage(dim=int(embed_dim * 2 ** i_stage),
                               n_div=n_div,
                               depth=depths[i_stage],
                               mlp_ratio=self.mlp_ratio,
                               drop_path=dpr[sum(depths[:i_stage]):sum(depths[:i_stage + 1])],
                               layer_scale_init_value=layer_scale_init_value,
                               norm_layer=norm_layer,
                               act_layer=act_layer,
                               pconv_fw_type=pconv_fw_type
                               )
            stages_list.append(stage)

            # patch merging layer (omitted after the last stage)
            if i_stage < self.num_stages - 1:
                stages_list.append(
                    PatchMerging(patch_size2=patch_size2,
                                 patch_stride2=patch_stride2,
                                 dim=int(embed_dim * 2 ** i_stage),
                                 norm_layer=norm_layer)
                )

        self.stages = nn.Sequential(*stages_list)

        self.fork_feat = fork_feat
        # NOTE(review): forward is bound to forward_det unconditionally,
        # so this class always behaves as a dense-prediction backbone
        # regardless of fork_feat.
        self.forward = self.forward_det

        # add a norm layer for each output; indices refer to positions in
        # self.stages (stages sit at the even indices 0, 2, 4, 6)
        self.out_indices = [0, 2, 4, 6]
        for i_emb, i_layer in enumerate(self.out_indices):
            # The FORK_LAST3 environment-variable path is intentionally
            # unsupported in this port.
            if i_emb == 0 and os.environ.get('FORK_LAST3', None):
                raise NotImplementedError
            else:
                layer = norm_layer(int(embed_dim * 2 ** i_emb))
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        self.apply(self.cls_init_weights)
        self.init_cfg = copy.deepcopy(init_cfg)
        # NOTE(review): init_weights is not defined in this file; this
        # call is only reachable when init_cfg or pretrained is provided.
        if self.fork_feat and (self.init_cfg is not None or pretrained is not None):
            self.init_weights()
        # Probe output channel widths with a dummy forward pass; the
        # Ultralytics model parser reads width_list to wire the neck.
        self.width_list = [i.size(1) for i in self.forward(torch.randn(1, 3, 640, 640))]

    def cls_init_weights(self, m):
        """Weight init applied via ``self.apply``: truncated normal for
        Linear/Conv weights, zero bias, unit affine for LayerNorm/GroupNorm
        (BatchNorm layers keep their default initialization)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, (nn.Conv1d, nn.Conv2d)):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, (nn.LayerNorm, nn.GroupNorm)):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward_det(self, x: Tensor) -> List[Tensor]:
        # output the features of four stages for dense prediction
        x = self.patch_embed(x)
        outs = []
        for idx, stage in enumerate(self.stages):
            x = stage(x)
            if self.fork_feat and idx in self.out_indices:
                # normalize each tapped feature map with its own norm layer
                norm_layer = getattr(self, f'norm{idx}')
                x_out = norm_layer(x)
                outs.append(x_out)
        return outs
然后修改对应的配置后,针对coco128进行训练,训练代码如下:
"""Train YOLO11 with the FasterNet backbone on coco128.

Builds the model from the yolo11-FasterNet.yaml config and launches a
standard Ultralytics training run.
"""
import warnings
from pathlib import Path

warnings.filterwarnings('ignore')
from ultralytics import YOLO

if __name__ == '__main__':
    model_name = 'yolo11-FasterNet'  # e.g. 'yolo11-FasterNet', 'yolo11-VanillaNet'
    dataset_name = 'coco128'

    # Build paths with pathlib instead of string concatenation; the
    # original raw-string + '\\' workaround produced doubled backslashes.
    cfg_root = Path(r'J:\PycharmProjects\ultralytics-yolo11\ultralytics\cfg')
    model = YOLO(str(cfg_root / 'models' / '11' / f'{model_name}.yaml'))
    # To build from YAML and then load pretrained weights instead:
    # model = YOLO(str(cfg_root / 'models' / '11' / f'{model_name}.yaml')).load('yolo11n.pt')

    model.train(
        data=str(cfg_root / 'datasets' / f'{dataset_name}.yaml'),
        cache=False,
        imgsz=640,
        epochs=100,
        single_cls=False,  # multi-class detection
        batch=4,
        close_mosaic=10,   # disable mosaic augmentation for the final 10 epochs
        workers=0,
        device='',         # let Ultralytics auto-select the device
        optimizer='SGD',
        amp=True,          # mixed-precision training
        project='runs/train',
        save=True,
        name='exp_' + model_name + '_datasets_' + dataset_name,
    )
训练结果
结论:相比从零开始单独训练yolo11.yaml网络,指标略好一点点;但与加载预训练权重(pt)的模型相比仍差距较大,毕竟只训练了100个epoch。