This decomposition is also called Large Kernel Attention (LKA). As shown in the figure above, a convolution with a very large kernel size is decomposed into a depth-wise convolution, a depth-wise dilated convolution, and a 1×1 convolution. This greatly reduces FLOPs and parameter count, while the enlarged effective receptive field overcomes the locality limitation of small-kernel convolutions.
import torch.nn as nn


class AttentionModule(nn.Module):
    def __init__(self, dim):
        super().__init__()
        # 5x5 depth-wise convolution: local spatial aggregation
        self.conv0 = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
        # 7x7 depth-wise dilated convolution (dilation 3): long-range spatial aggregation
        self.conv_spatial = nn.Conv2d(dim, dim, 7, stride=1, padding=9, groups=dim, dilation=3)
        # 1x1 convolution: channel mixing
        self.conv1 = nn.Conv2d(dim, dim, 1)

    def forward(self, x):
        u = x.clone()
        # attention map produced by the decomposed large-kernel convolution
        attn = self.conv0(x)
        attn = self.conv_spatial(attn)
        attn = self.conv1(attn)
        # element-wise reweighting of the input by the attention map
        return u * attn
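As a quick sanity check on the FLOPs/parameter claim, the sketch below compares the parameter count of AttentionModule with a single dense large-kernel convolution of the same channel width. The 21×21 baseline kernel size and dim = 64 are illustrative assumptions (21×21 is the decomposition target used in the VAN paper, not something stated in this post).

import torch

# Minimal sketch, not from the original post: decomposed LKA vs. a dense large-kernel conv.
dim = 64
lka = AttentionModule(dim)
big_conv = nn.Conv2d(dim, dim, kernel_size=21, padding=10)  # hypothetical dense baseline

lka_params = sum(p.numel() for p in lka.parameters())
big_params = sum(p.numel() for p in big_conv.parameters())
print(f"LKA params: {lka_params}")         # ~9k for dim = 64
print(f"21x21 conv params: {big_params}")  # ~1.8M for dim = 64

x = torch.randn(1, dim, 32, 32)
print(lka(x).shape)  # torch.Size([1, 64, 32, 32]) -- spatial size is preserved

The gap comes from the grouping: the two depth-wise convolutions grow linearly with dim, and only the cheap 1×1 convolution mixes channels, whereas a dense K×K convolution scales with dim² · K².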