参考博客:https://zhuanlan.zhihu.com/p/148737297
原理其实就是去计算每个词和其他词特征向量的相关性(特征向量和embedding差不多,不知道区别的话,就当成一个东西看)
主要有三个变量query key value
query和key主要是用来算embedding的相关性的,他们两个首先进行计算,得出所有embedding的相关性系数矩阵(权重矩阵)
value,原始的embedding
主要分为三步:
第一步,算出每个embedding和其他embedding的相关性,生成一个相关性矩阵
这个过程,大家都知道,其实embedding就是经过种种卷积或者其他操作后生成的一个新的特征向量,那么两个向量之间是可以计算相关性系数的,主要有以下方式:
cosine相关性系数
欧氏距离
点乘等等
第二步,对相关性系数矩阵进行softmax
softmax其实就是将这个系数矩阵重新映射到0-1之间(且每一行的权重之和为1),类似于sigmoid函数
第三步,与原始的embedding相乘,将该embedding与其他embedding生成的相关性系数乘进去,这样也就将该embedding与其他embedding的相关性集成了进去
代码实现网上有好多,粘下来几个,供大家参考
#self-attention
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
# NOTE(review): the code below uses Seq/Lin/ReLU/LN/Dropout as torch.nn
# aliases; LN is assumed to be LayerNorm — confirm against the original blog.
from torch.nn import Dropout, Linear as Lin, ReLU, Sequential as Seq, LayerNorm as LN
def calculate_distance_for(x, y):
    """Pairwise Euclidean (p=2) distances between two batches of vectors.

    Args:
        x: tensor of shape (batch, rows, dim).
        y: tensor of shape (batch, cols, dim).

    Returns:
        Tensor of shape (batch, rows, cols) where entry [i, j, k] is the
        L2 distance between x[i, j] and y[i, k].
    """
    # torch.cdist computes all pairwise p=2 distances in one vectorized C
    # call, replacing the original O(batch*rows*cols) Python triple loop.
    # The result lives on x's device, which also fixes the original's use
    # of an undefined global `device`.
    return torch.cdist(x, y, p=2)
class SelfAttention(nn.Module):
    """Scaled dot-product self-attention.

    The correlation of every query with every key is measured by a dot
    product scaled by sqrt(feature_dim); the resulting weights are
    softmax-normalized and used to mix the value vectors.  Other
    similarity measures (e.g. Euclidean distance) could be substituted
    for the dot product.
    """

    def __init__(self, dropout, **kwargs):
        super(SelfAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values):
        # Similarity of every query with every key, scaled so the logits
        # stay in a numerically friendly range.
        dim = queries.shape[-1]
        logits = torch.bmm(queries, keys.transpose(1, 2))
        logits = logits / math.sqrt(dim)
        # Re-map each row of the score matrix so its weights sum to 1.
        self.attention_weights = torch.softmax(logits, dim=2)
        # Aggregate the values with the (dropped-out) attention weights,
        # folding each vector's correlation with all others back in.
        weighted = self.dropout(self.attention_weights)
        return torch.bmm(weighted, values)
# Demo: run self-attention on a dummy batch.
attention = SelfAttention(dropout=0.5)
# batch_size  - number of samples fed to the model at once
# num_queries - sequence length (number of tokens for NLP, or residues for
#               a protein sequence)
# num_hiddens - dimensionality of each feature vector / embedding; depends
#               on which layer the self-attention sits in
batch_size, num_queries, num_hiddens = 2, 100, 21
# A constant (batch, seq, dim) tensor stands in for real embeddings.
X = torch.ones((batch_size, num_queries, num_hiddens))
# Self-attention: the same tensor serves as query, key and value.
ans = attention(X, X, X)
print(ans)
multihead attention(原理:将embedding划分成不同的块,对每块进行selfattention的操作):
def clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of *module*."""
    copies = (copy.deepcopy(module) for _ in range(N))
    return nn.ModuleList(copies)
def attention(query, key, value, mask=None, dropout=None):
    """Compute 'Scaled Dot Product Attention'.

    Args:
        query, key, value: tensors whose last dimension is the feature
            size d_k; leading dimensions (batch, heads, ...) must match.
        mask: optional tensor broadcastable to the score matrix; positions
            where mask == 0 are excluded from the softmax.  (The original
            accepted this argument but never applied it.)
        dropout: optional dropout module applied to the attention weights.

    Returns:
        (output, weights): the attended values and the softmax weights.
    """
    d_k = query.size(-1)
    # Scale by sqrt(d_k) so the logits don't grow with the feature size.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Bug fix: masked positions get a large negative logit so their
        # softmax weight is effectively zero.
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn
def MLP(channels, batch_norm=True):
    """Build a multilayer perceptron from the given layer widths.

    Args:
        channels: sequence of layer sizes, e.g. [in, hidden, ..., out].
        batch_norm: when True, append a normalization layer (LN) after
            each ReLU.  Bug fix: the original ignored this flag and always
            added LN; honoring it leaves the default call unchanged.

    Returns:
        A Seq (sequential) container of Linear -> ReLU [-> LN] stages.
    """
    # NOTE(review): Seq/Lin/ReLU/LN are aliases defined elsewhere
    # (presumably torch.nn layers) — confirm LN's exact type.
    stages = []
    for i in range(1, len(channels)):
        stage = [Lin(channels[i - 1], channels[i]), ReLU()]
        if batch_norm:
            stage.append(LN(channels[i]))
        stages.append(Seq(*stage))
    return Seq(*stages)
class MultiHeadAttention(nn.Module):
    """Multi-head attention: split d_model into h heads of size d_k, run
    scaled dot-product attention on each head independently, then merge
    the heads and map each position to a single scalar via ``self.mlp``.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Take in model size and number of heads."""
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h  # per-head feature size
        self.h = h
        # 3 projections for query/key/value + 1 output projection.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None  # stores the most recent attention weights
        self.dropout = nn.Dropout(p=dropout)
        # Bug fix: the MLP input width was hard-coded to 21; use d_model so
        # the module works for any model size (identical when d_model == 21).
        self.mlp = Seq(MLP([d_model, 10]), Dropout(0.5), Lin(10, 1))

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        if mask is not None:
            # Broadcast the same mask over every attention head.
            mask = mask.unsqueeze(1)
        # Project, then split the feature dimension into h sub-blocks and
        # run self-attention on each:
        # (batch, seq, d_model) -> (batch, h, seq, d_k)
        query, key, value = [
            lin(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        # Merge the heads back: (batch, h, seq, d_k) -> (batch, seq, d_model)
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        out = self.linears[-1](x)
        return self.mlp(out)
# Demo: multi-head attention on random data (self-attention: the same
# tensor is used as query, key and value).
h = 7             # number of attention heads
d_model = 21      # feature dimensionality
batch_size = 16
seq_length = 100  # sequence length
model = MultiHeadAttention(h, d_model)
inputs = torch.randn([batch_size, seq_length, d_model])
query, key, value = inputs, inputs, inputs
m = model(query, key, value)
print(m)