Self-Attention - CSDN博客

本文链接：https://blog.csdn.net/qq_40149035/article/details/119823204

本文介绍了一种用于神经网络的 Self-Attention 层：通过 1x1 的 Conv2d 卷积得到查询（query）、键（key）和值（value）的投影，对查询与键的相似度做 softmax 归一化得到注意力图，再用注意力图对值进行加权求和，最后乘以可学习系数 gamma 并与输入特征相加，展示了如何在前向传播中组合这些组件。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

class Self_Attn(nn.Module):
    """Self attention layer over the spatial positions of a 2-D feature map.

    Query, key and value are produced by 1x1 convolutions. The attention-
    weighted values are scaled by the learnable scalar ``gamma`` (initialised
    to zero) and added back onto the input.
    """

    def __init__(self, in_dim, activation):
        """
            inputs :
                in_dim : number of channels of the input feature maps
                activation : stored on the module unchanged; forward() does
                             not use it
        """
        super().__init__()
        self.chanel_in = in_dim  # (sic) attribute name kept for compatibility
        self.activation = activation

        # Query/key are projected to a reduced width of in_dim // 8. Clamp to
        # at least one channel so that in_dim < 8 does not build empty
        # (zero-channel) projections.
        reduced_dim = max(in_dim // 8, 1)

        self.query_conv = nn.Conv2d(in_channels=in_dim, out_channels=reduced_dim, kernel_size=1)
        self.key_conv = nn.Conv2d(in_channels=in_dim, out_channels=reduced_dim, kernel_size=1)
        self.value_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        # Zero-initialised, so the attention branch contributes nothing until
        # gamma is updated by training.
        self.gamma = nn.Parameter(torch.zeros(1))

        # Normalise over the last axis of the B X N X N energy tensor.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """
            inputs :
                x : input feature maps( B X C X W X H)
            returns :
                out : gamma * self attention value + input feature
                      (same shape as x)
                attention: B X N X N (N is Width*Height)
        """
        m_batchsize, C, width, height = x.size()
        num_positions = width * height  # N

        proj_query = self.query_conv(x).view(m_batchsize, -1, num_positions).permute(0, 2, 1)  # B X N X C'
        proj_key = self.key_conv(x).view(m_batchsize, -1, num_positions)  # B X C' X N
        energy = torch.bmm(proj_query, proj_key)  # B X N X N
        attention = self.softmax(energy)  # B X N X N, normalised over the last axis
        proj_value = self.value_conv(x).view(m_batchsize, -1, num_positions)  # B X C X N

        # out[b, c, i] = sum_j proj_value[b, c, j] * attention[b, i, j]
        out = torch.bmm(proj_value, attention.permute(0, 2, 1))  # B X C X N
        out = out.view(m_batchsize, C, width, height)

        # Residual connection scaled by the learnable gamma.
        out = self.gamma * out + x
        return out, attention