import torch
from torch import nn
import torch.nn.functional as F
import math
class AttentionHead(nn.Module):
    """A single attention head: projects the inputs into a smaller
    query/key/value space and applies scaled dot-product attention."""

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, query, key, value, mask=None):
        query, key, value = self.q(query), self.k(key), self.v(value)
        # scores: [batch_size, seq_len_q, seq_len_k], scaled by sqrt(head_dim)
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(query.size(-1))
        if mask is not None:
            if mask.dim() == 2:
                # Expand a padding mask [batch_size, seq_len] to [batch_size, seq_len, seq_len];
                # a 3-D attention mask is used as-is.
                mask = mask.unsqueeze(dim=1).repeat(1, mask.size(1), 1)
            assert scores.size() == mask.size()
            scores = scores.masked_fill(mask == 0, -float("inf"))
        # Dropout is applied to the attention weights.
        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.bmm(weights, value)
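
As a quick sanity check, a single head can be run on random tensors; the batch size, sequence length, and dimensions below are illustrative values, not taken from the original post:

head = AttentionHead(embed_dim=768, head_dim=64)
x = torch.randn(2, 10, 768)          # [batch_size, seq_len, embed_dim]
pad_mask = torch.ones(2, 10)         # binary padding mask: 1 = real token, 0 = padding
out = head(x, x, x, mask=pad_mask)   # self-attention: query = key = value
print(out.shape)                     # torch.Size([2, 10, 64])
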
class SequentialMultiHeadAttention(nn.Module):
    """Multi-head attention built from a ModuleList of independent heads;
    the heads are run one after another and concatenated along the feature dimension."""

    def __init__(self, hidden_size, num_heads):
        super().__init__()
        head_dim = hidden_size // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(hidden_size, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, query, key, value, attn_mask=None, query_mask=None, key_mask=None):
        if query_mask is not None and key_mask is not None:
            # Combine the two binary padding masks into a [batch_size, seq_len_q, seq_len_k]
            # attention mask; cast to float so bmm also accepts bool/int masks.
            attn_mask = torch.bmm(query_mask.float().unsqueeze(-1), key_mask.float().unsqueeze(1))
        # Each head returns [batch_size, seq_len_q, head_dim]; concatenating
        # num_heads of them restores the full hidden size.
        x = torch.cat([h(query, key, value, attn_mask) for h in self.heads], dim=-1)
        x = self.dropout(self.output_linear(x))
        return x
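
A minimal self-attention call, again with made-up sizes (hidden_size must be divisible by num_heads so that head_dim = hidden_size // num_heads is exact):

mha = SequentialMultiHeadAttention(hidden_size=768, num_heads=12)
x = torch.randn(2, 10, 768)           # [batch_size, seq_len, hidden_size]
pad_mask = torch.ones(2, 10)          # binary padding mask shared by query and key
out = mha(x, x, x, query_mask=pad_mask, key_mask=pad_mask)
print(out.shape)                      # torch.Size([2, 10, 768])
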
class ScaledDotProductAttention(nn.Module):
    """Stateless scaled dot-product attention, factoring out the score
    computation already used inside AttentionHead above."""

    def forward(self, query, key, value, mask=None):
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(query.size(-1))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -float("inf"))
        return torch.bmm(F.softmax(scores, dim=-1), value)
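
This module operates directly on already-projected query/key/value tensors, so it can be reused by any multi-head wrapper. A quick shape check with arbitrary illustrative sizes:

attn = ScaledDotProductAttention()
q = torch.randn(2, 10, 64)            # [batch_size, seq_len_q, head_dim]
k = torch.randn(2, 12, 64)            # [batch_size, seq_len_k, head_dim]
v = torch.randn(2, 12, 64)            # [batch_size, seq_len_k, head_dim]
print(attn(q, k, v).shape)            # torch.Size([2, 10, 64])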