References
Code: https://github.com/kaijieshi7/Dynamic-convolution-Pytorch
Blog: https://mp.weixin.qq.com/s/mZUSH7_7ysISoSMfxo4Vjw
The idea: a single fixed convolution kernel is replaced by a kernel that adapts itself to the input via attention.
As shown in the figure above, each kernel is now determined by K kernels, so the number of parameters grows roughly K times (the paper claims the computational cost barely increases).
The input is passed through an attention branch to obtain weights for these K kernels; their linear combination gives the new kernel, as the sketch below illustrates.
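For a single input the aggregation is just a weighted sum over the first dimension of the kernel bank. A minimal sketch, with made-up tensor names (kernel_bank, pi) purely for illustration:

import torch

K, out_planes, in_planes, ks = 4, 16, 8, 3
kernel_bank = torch.randn(K, out_planes, in_planes, ks, ks)  # the K candidate kernels
pi = torch.softmax(torch.randn(K), dim=0)                    # attention weights for one input, sum to 1
# linear combination: the aggregated kernel has the shape of one ordinary conv kernel
aggregated = (pi.view(K, 1, 1, 1, 1) * kernel_bank).sum(dim=0)
print(aggregated.shape)  # torch.Size([16, 8, 3, 3])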
The attention part is just the usual SENet recipe and produces the K coefficients. The tricky part is implementing the switch from one kernel to a weighted combination of K kernels: for example, when batch_size > 1, every sample in the batch needs its own attention weights and hence its own aggregated kernel.
At this point, multiplying the attention weights with the reshaped kernels gives weight_size = [batch_size, out_planes*in_planes*kernel_size*kernel_size],
while x has shape [batch_size, in_planes, height, width].
With grouped convolution, the first two dimensions of weight_size are merged into a single kernel with output_channel = batch_size*out_planes, and the convolution is run with batch_size groups (both the input x and the kernel are split into batch_size groups), so each sample ends up with its own kernel.
Without grouped convolution we would skip merging those first two dimensions, leaving the kernel with weight_size = [batch_size, out_planes, in_planes, kernel_size, kernel_size], i.e. five dimensions; but the weight argument of F.conv2d must be four-dimensional and applies the same set of kernels to every sample of x, so grouped convolution is used instead (a minimal sketch of the trick follows).
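A minimal sketch of this grouped-convolution trick, with shapes and names made up for illustration. It folds the batch into the channel dimension, convolves with groups=batch_size, and checks that the result matches convolving each sample with its own kernel one by one:

import torch
import torch.nn.functional as F

batch_size, in_planes, out_planes, ks = 2, 8, 16, 3
x = torch.randn(batch_size, in_planes, 32, 32)
# one aggregated kernel per sample: (batch_size, out_planes, in_planes, ks, ks)
per_sample_weight = torch.randn(batch_size, out_planes, in_planes, ks, ks)

# merge batch_size and out_planes, fold the batch into the channel dimension,
# then convolve with groups=batch_size so group i only sees sample i and kernel set i
w = per_sample_weight.view(batch_size * out_planes, in_planes, ks, ks)
y = F.conv2d(x.view(1, batch_size * in_planes, 32, 32), w, padding=1, groups=batch_size)
y = y.view(batch_size, out_planes, y.size(-2), y.size(-1))

# reference: convolve each sample with its own kernel separately and concatenate
y_ref = torch.cat([F.conv2d(x[i:i + 1], per_sample_weight[i], padding=1)
                   for i in range(batch_size)])
print(torch.allclose(y, y_ref, atol=1e-5))  # True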
Below is the code from the link at the top, with some of my own annotations added.
import torch
import torch.nn as nn
import torch.nn.functional as F
import pdb
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '7'


class attention2d(nn.Module):
    def __init__(self, in_planes, ratio, K, temperature, init_weight=True):
        super(attention2d, self).__init__()
        assert temperature % 3 == 1  # temperature is decreased by 3 per step, so it ends exactly at 1
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        if in_planes != 3:
            hidden_planes = int(in_planes*ratio)  # this forms a bottleneck structure, as in SENet
        else:
            hidden_planes = K
        self.fc1 = nn.Conv2d(in_planes, hidden_planes, 1, bias=False)
        self.fc2 = nn.Conv2d(hidden_planes, K, 1, bias=False)
        self.temperature = temperature
        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):  # only Conv2d is initialized -- don't the fc layers need it? (fc1/fc2 are 1x1 convs, so they are covered)
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)  # bias is simply initialized to 0?

    def update_temperature(self):
        if self.temperature != 1:
            self.temperature -= 3
            print('Change temperature to:', str(self.temperature))

    def forward(self, x):
        x = self.avgpool(x)       # global average pooling, as in SENet
        # pdb.set_trace()         # leftover breakpoint, disabled so the forward pass runs
        x = self.fc1(x)
        x = F.relu(x)
        # pdb.set_trace()
        x = self.fc2(x).view(x.size(0), -1)   # (batch_size, K)
        return F.softmax(x/self.temperature, 1)
class Dynamic_conv2d(nn.Module):
    def __init__(self, in_planes, out_planes, kernel_size, ratio=0.25, stride=1, padding=0, dilation=1, groups=1, bias=True, K=4, temperature=34, init_weight=True):
        super(Dynamic_conv2d, self).__init__()
        assert in_planes % groups == 0  # groups should normally be 1 here, since this is not a real SENet -- what is it for? (it just lets the layer also behave as an ordinary grouped conv)
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.K = K
        self.attention = attention2d(in_planes, ratio, K, temperature)

        self.weight = nn.Parameter(torch.Tensor(K, out_planes, in_planes//groups, kernel_size, kernel_size), requires_grad=True)
        if bias:
            self.bias = nn.Parameter(torch.Tensor(K, out_planes))  # one bias vector per candidate kernel
        else:
            self.bias = None
        if init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        for i in range(self.K):
            nn.init.kaiming_uniform_(self.weight[i])  # why kaiming_uniform_ here but kaiming_normal_ above?
        if self.bias is not None:
            nn.init.zeros_(self.bias)  # without this the bias tensor stays uninitialized

    def update_temperature(self):
        self.attention.update_temperature()

    def forward(self, x):
        softmax_attention = self.attention(x)  # (batch_size, K)
        batch_size, in_planes, height, width = x.size()
        x = x.view(1, -1, height, width)       # fold the batch into the channel dimension: (1, batch_size*in_planes, H, W)
        weight = self.weight.view(self.K, -1)  # (K, out_planes*(in_planes//groups)*kernel_size*kernel_size)
        # multiply attention by weight to get batch_size sets of aggregated kernels;
        # the view merges the batch_size and output_channel dimensions so that grouped conv can be used directly below -- this view is the key step
        aggregate_weight = torch.mm(softmax_attention, weight).view(-1, self.in_planes//self.groups, self.kernel_size, self.kernel_size)
        if self.bias is not None:
            aggregate_bias = torch.mm(softmax_attention, self.bias).view(-1)  # (batch_size, out_planes) -> (b*o,)
            # the key step, matching the reshapes above
            output = F.conv2d(x, weight=aggregate_weight, bias=aggregate_bias, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups*batch_size)
            # if groups itself is 1, the grouped conv here just splits the merged batch_size*output_channel apart again:
            # the input x is split into batch_size groups and the kernel into batch_size groups,
            # each group is convolved separately and the results are concatenated,
            # so every sample gets its own kernel
        else:
            output = F.conv2d(x, weight=aggregate_weight, bias=None, stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups*batch_size)
        output = output.view(batch_size, self.out_planes, output.size(-2), output.size(-1))
        return output
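For completeness, a quick smoke test of the layer above, assuming the classes are defined as written; the shapes are arbitrary, and temperature must satisfy the assert temperature % 3 == 1 in attention2d:

if __name__ == '__main__':
    x = torch.randn(4, 16, 32, 32)  # (batch_size, in_planes, H, W)
    layer = Dynamic_conv2d(in_planes=16, out_planes=32, kernel_size=3, padding=1, K=4, temperature=34)
    out = layer(x)
    print(out.shape)            # torch.Size([4, 32, 32, 32])
    layer.update_temperature()  # anneal the softmax temperature during training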