注意力机制——mask、加性注意力机制的代码讲解

小杨生煎001616

已于 2023-01-08 11:47:25 修改

阅读量2.3k

点赞数 3

文章标签： python 深度学习开发语言

于 2022-12-18 23:41:39 首次发布

本文链接：https://blog.csdn.net/yzj001616/article/details/128366982

版权

1.mask注意力评分函数

import math
import torch
from torch import nn
from d2l import torch as d2l

def masked_softmax(X, valid_lens):
    """Softmax over the last axis, ignoring positions past each valid length.

    Args:
        X: 3D tensor of scores; the softmax is taken over its last axis.
        valid_lens: ``None`` (plain softmax), a 1D tensor with one length per
            batch element (shared by every row of that element), or a 2D
            tensor with one length per row of ``X``.

    Returns:
        A tensor with the same shape as ``X``. In every row, entries at
        index >= the row's valid length get (numerically) zero weight.

    The input ``X`` is never modified: masking is done out of place, so the
    function is safe to call on tensors the caller still needs and on leaf
    tensors that require gradients.
    """
    if valid_lens is None:
        return nn.functional.softmax(X, dim=-1)

    shape = X.shape
    if valid_lens.dim() == 1:
        # One length per batch element -> repeat it for each row of that element.
        valid_lens = torch.repeat_interleave(valid_lens, shape[1])
    else:
        # One length per row already; flatten to line up with the 2D view below.
        valid_lens = valid_lens.reshape(-1)

    flat_scores = X.reshape(-1, shape[-1])
    positions = torch.arange(shape[-1], dtype=torch.float32, device=X.device)
    keep = positions[None, :] < valid_lens[:, None]
    # A very large negative score makes exp() underflow to 0 after softmax.
    # masked_fill returns a new tensor instead of writing into a view of X.
    flat_scores = flat_scores.masked_fill(~keep, -1e6)
    return nn.functional.softmax(flat_scores.reshape(shape), dim=-1)

以下是输入X与valid_lens的对应情况的三个例子：

2.加性注意力机制

加性注意力用于查询（queries）和键（keys）是不同长度的向量（即二者特征维度不同）的情况。

class AdditiveAttention(nn.Module):
    """Additive attention.

    Queries and keys are projected separately to ``num_hiddens`` features,
    added together with broadcasting, squashed by tanh and reduced to a single
    score per (query, key) pair by ``w_v``. The scores are normalised with
    ``masked_softmax`` and used to take a weighted sum of the values.
    """

    def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
        """Create the three bias-free projections and the dropout layer.

        Args:
            key_size: feature size of each key vector.
            query_size: feature size of each query vector.
            num_hiddens: size of the shared hidden space.
            dropout: drop probability applied to the attention weights.
            **kwargs: forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        # Separate projections let keys and queries have different sizes.
        self.W_k = nn.Linear(key_size, num_hiddens, bias=False)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=False)
        # Collapses each hidden feature vector to one scalar score.
        self.w_v = nn.Linear(num_hiddens, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens):
        """Return the attention-weighted combination of ``values``.

        Assumes ``queries``, ``keys`` and ``values`` are 3D tensors whose
        first axis is the batch (``torch.bmm`` requires 3D operands); the
        last axis of ``queries``/``keys`` must match ``query_size``/``key_size``.
        ``valid_lens`` is passed straight to ``masked_softmax``.

        The weights computed here are also stored in ``self.attention_weights``.
        """
        projected_queries = self.W_q(queries)
        projected_keys = self.W_k(keys)
        # Insert singleton axes so that every query is paired with every key
        # by broadcasting before the element-wise sum.
        pairwise = projected_queries.unsqueeze(2) + projected_keys.unsqueeze(1)
        hidden = torch.tanh(pairwise)
        # w_v yields a trailing axis of size 1; drop it to get one score per pair.
        scores = self.w_v(hidden).squeeze(-1)
        self.attention_weights = masked_softmax(scores, valid_lens)
        dropped_weights = self.dropout(self.attention_weights)
        return torch.bmm(dropped_weights, values)

3.带入一个样例测试一下

# Toy batch of 2: one random query of size 20 and ten all-ones keys of size 2.
queries = torch.normal(0, 1, (2, 1, 20))
keys = torch.ones((2, 10, 2))
# Ten value rows of size 4 holding 0..39, duplicated for both batch elements.
value_rows = torch.arange(40, dtype=torch.float32).reshape(1, 10, 4)
values = value_rows.repeat(2, 1, 1)
print(queries)  # shape (2, 1, 20)
print(keys)
print(values)
# Only the first 2 keys count for sample 0 and the first 6 for sample 1.
valid_lens = torch.tensor([2, 6])

attention = AdditiveAttention(
    key_size=2,
    query_size=20,
    num_hiddens=8,
    dropout=0.1,
)
# Switch to evaluation mode so that the dropout layer does not drop anything.
attention.eval()
attention(queries, keys, values, valid_lens)

# Draw the stored attention weights as a heatmap.
weights_for_plot = attention.attention_weights.reshape((1, 1, 2, 10))
d2l.show_heatmaps(weights_for_plot, xlabel='Keys', ylabel='Queries')

涉及的torch函数用法：

1）torch.randn()函数，返回一个均值为0，方差为1的正态分布中填充随机数的张量

>>> torch.randn(4) # 一行四列
tensor([-2.1436,  0.9966,  2.3426, -0.6366])
>>> torch.randn(2,3) # 两行三列
tensor([[ 1.5954,  2.8929, -1.0923],
        [ 1.1719, -0.4709, -0.1996]])
>>> torch.randn(2,2,3) # 三维张量：两个两行三列的矩阵
tensor([[[-0.1687, -0.2883, -1.2846],
         [ 0.8579,  1.1618,  1.5979]],

        [[-1.2387, -0.7416, -0.4778],
         [-0.6276, -1.6339,  1.0678]]])

2）torch.nn.functional.softmax()函数，沿指定维度把张量归一化为概率分布（该维度上各元素之和为1）

对于二维和一维的张量：

# nn.functional.softmax(x, dim): dim=0 normalises down each column,
# dim=1 normalises across each row.
# torch.nn has no `Tensor` attribute, so the inputs are built with
# torch.tensor; an explicit float dtype is needed because softmax is not
# defined for integer tensors.
x = torch.tensor([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                 dtype=torch.float32)

y1 = nn.functional.softmax(x, dim=0)  # softmax over each column
print(y1)

y2 = nn.functional.softmax(x, dim=1)  # softmax over each row
print(y2)

x1 = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
print(x1)

y3 = nn.functional.softmax(x1, dim=0)  # a 1-D tensor only has dim=0; dim=1 raises
print(y3)

#输出
tensor([[0.3333, 0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333, 0.3333],
        [0.3333, 0.3333, 0.3333, 0.3333]])
tensor([[0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439],
        [0.0321, 0.0871, 0.2369, 0.6439]])
tensor([1., 2., 3., 4.])
tensor([0.0321, 0.0871, 0.2369, 0.6439])

对于多维的矩阵：

import torch
import torch.nn.functional as F
# Random 3-D tensor: 2 matrices of 2 rows x 3 columns.
# NOTE: the name `input` shadows the Python builtin; it is kept unchanged here.
input = torch.randn(2, 2, 3)
print(input)

# dim=0: normalise across the two matrices, element position by element position.
m1 = F.softmax(input, dim=0)
# dim=1: normalise down each column inside every matrix.
m2 = F.softmax(input, dim=1)
# dim=2: normalise along each row inside every matrix.
m3 = F.softmax(input, dim=2)
# dim=-1 is the last axis, i.e. the same as dim=2 for a 3-D tensor.
m4 = F.softmax(input, dim=-1)

print(m1)

print(m2)

print(m3)

print(m4)

# 输出
tensor([[[-3.9332,  0.7909,  0.8927],
         [-1.7991,  0.2505,  0.7695]],

        [[ 0.1946,  0.1878,  1.2713],
         [ 0.9536,  1.0525, -0.7081]]])

tensor([[[0.0159, 0.6464, 0.4065],
         [0.0599, 0.3096, 0.8142]],

        [[0.9841, 0.3536, 0.5935],
         [0.9401, 0.6904, 0.1858]]])

tensor([[[0.1058, 0.6319, 0.5308],
         [0.8942, 0.3681, 0.4692]],

        [[0.3189, 0.2964, 0.8786],
         [0.6811, 0.7036, 0.1214]]])

tensor([[[0.0042, 0.4726, 0.5232],
         [0.0458, 0.3560, 0.5982]],

        [[0.2029, 0.2015, 0.5955],
         [0.4360, 0.4813, 0.0828]]])


tensor([[[0.0042, 0.4726, 0.5232],
         [0.0458, 0.3560, 0.5982]],

        [[0.2029, 0.2015, 0.5955],
         [0.4360, 0.4813, 0.0828]]])

3）X.shape、np.size(X,0/1)、X.shape[0]、X.shape[1]、X.shape[-1]、

X.shape返回张量X的形状；np.size(X, axis)返回X在指定轴上的长度：axis=0输出行数，axis=1输出列数，不指定axis时输出X的元素总数（.shape是属性，np.size()是函数）。

对于二维张量，shape[0]代表行数，shape[1]代表列数，同理三维张量还有shape[2]

对于图像来说：

image.shape[0]——图片高

image.shape[1]——图片宽

image.shape[2]——图片通道数

而对于矩阵来说：

shape[0]：表示矩阵的行数

shape[1]：表示矩阵的列数

shape[-1]：一般来说，-1代表最后一个，所以shape[-1]代表最后一个维度，如在二维张量里，shape[-1]表示列数，在一维行向量，shape[-1]表示行向量的元素总数，换言之也是列数。

import numpy as np
# The same four values stored as a 1-D vector, a column and a row.
a = np.array([0, 1, 2, 3])
b = np.array([[0], [1], [2], [3]])
c = np.array([[0, 1, 2, 3]])

# Shapes of the three layouts, in order.
for array_shape in (a.shape, b.shape, c.shape):
    print(array_shape)

# Total number of elements in c, then its first and last axis lengths.
element_count = np.size(c)
print(element_count)
first_axis_len = c.shape[0]
last_axis_len = c.shape[-1]
print(first_axis_len)
print(last_axis_len)

# 输出
(4,)
(4, 1)
(1, 4)
4
1
4

4）None

None 和 False 不同，它不表示 0，也不表示空字符串，而表示没有值，也就是空值。它属于 NoneType 类型，且 None 是 NoneType 数据类型的唯一值。除此之外，None 常用于 assert、判断以及函数无返回值的情况。举个例子，我们一直使用 print() 函数输出数据，其实该函数的返回值就是 None。

5）torch.repeat_interleave()函数

dim=0，按行复制，dim=1，按列复制，没给出dim的值的话，就把a拉成一维数组复制。

# 2 x 5 matrix holding 0..9.
a = torch.arange(10).reshape(2, 5)
# Repeat every row 3 times (along dim 0).
b = a.repeat_interleave(3, dim=0)
# Repeat every column 3 times (along dim 1).
c = a.repeat_interleave(3, dim=1)
# Without dim the tensor is flattened first, then each element is repeated.
d = a.repeat_interleave(3)

print(a)
print(b)
print(c)
print(d)

# 输出
tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
tensor([[0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9],
        [5, 6, 7, 8, 9],
        [5, 6, 7, 8, 9]])
tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
        [5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9]])
tensor([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7,
        8, 8, 8, 9, 9, 9])

6）X.reshape()

参数-1表示该维度的大小不用手动指定，而是由元素总数和其余维度自动推断出来（最多只能有一个维度写-1）。

>>> X.shape
(209, 64, 64, 3)

>>> X.reshape(X.shape[0], -1)
(209, 64*64*3)

>>> a = torch.tensor([[1, 2, 3], [4, 5, 6]])
>>> a.reshape(-1, a.shape[-1])
>>> print(a)
tensor([[1, 2, 3],
        [4, 5, 6]])

7）torch.repeat_interleave()

torch.repeat_interleave(input, repeats, dim=None, *, output_size=None)

# Two 1 x 3 matrices holding 0..5.
a = torch.arange(6).view(2, 1, 3)
# Repeat each slice along dim 1 three times, turning (2, 1, 3) into (2, 3, 3).
res = a.repeat_interleave(3, dim=1)
print(res)
original_shape = a.shape
repeated_shape = res.shape
print(original_shape)
print(repeated_shape)

运行结果：
tensor([[[0, 1, 2],
         [0, 1, 2],
         [0, 1, 2]],

        [[3, 4, 5],
         [3, 4, 5],
         [3, 4, 5]]])
torch.Size([2, 1, 3])
torch.Size([2, 3, 3])