1. 点积注意力(Dot Product Attention)
import torch
class DotProductAttention(torch.nn.Module):
def forward(self, query, key, value):
scores = torch.matmul(query, key.transpose(-2, -1))
attention_weights = torch.nn.functional.softmax(scores, dim=-1)
output = torch.matmul(attention_weights, value)
return output, attention_weights
2. 缩放点积注意力(Scaled Dot Product Attention)
import torch
class ScaledDotProductAttention(torch.nn.Module):
def forward(self, query, key, value):
d_k = query.size(-1)
scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
attention_weights = torch.nn.functional.softmax(scores, dim=-1)
output = torch.matmul(attention_weights, value)
return output, attention_weights
3. 自注意力(Self-Attention)
import torch
class SelfAttention(torch.nn.Module):
def __init__(self, embed_size, heads):
super(SelfAttention, self).__init__()
self.embed_size = embed_size
self.heads = heads
self.head_dim = embed_size // heads
assert self.head_dim * heads == embed_size, "Embedding size needs to be divisible by heads"
self.values = torch.nn.Linear(self.head_dim, self.head_dim, bias=False)
self.keys = torch.nn.Linear(self.head_dim, self.head_dim, bias=False)
self.queries = torch.nn.Linear(self.head_dim, self.head_dim, bias=False)
self.fc_out = torch.nn.Linear(heads * self.head_dim, embed_size)
def forward(self, values, keys, query, mask):
N = query.shape[0]
value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
values = values.reshape(N, value_len, self.heads, self.head_dim)
keys = keys.reshape(N, key_len, self.heads, self.head_dim)
queries = query.reshape(N, query_len, self.heads, self.head_dim)
values = self.values(values)
keys = self.keys(keys)
queries = self.queries(queries)
energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys]) # batch matrix multiplication
if mask is not None:
energy = energy.masked_fill(mask == 0, float("-1e20"))
attention = torch.nn.functional.softmax(energy / (self.embed_size ** (1/2)), dim=3)
out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, query_len, self.heads * self.head_dim)
out = self.fc_out(out)
return out