- 🍨 本文为🔗365天深度学习训练营 中的学习记录博客
- 🍖 原作者:K同学啊 | 接辅导、项目定制
🏡我的环境:
- 语言环境:Python3.11.4
- 编译器:Jupyter Notebook
- torch版本:2.0.1
1.多头注意力机制
import torch
import torch.nn as nn
import math
# Pick the compute device once: CUDA when a GPU is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Query, key and value are each projected by their own linear layer,
    split into ``n_heads`` heads of size ``hid_dim // n_heads``, attended
    per head, merged back and passed through an output projection.

    Args:
        hid_dim: Model (hidden) dimension of the inputs and outputs.
        n_heads: Number of attention heads; must divide ``hid_dim`` evenly.
    """

    def __init__(self, hid_dim, n_heads):
        super(MultiHeadAttention, self).__init__()
        assert hid_dim % n_heads == 0
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)
        self.fc = nn.Linear(hid_dim, hid_dim)
        # Registered as a buffer so that ``.to(device)`` / ``.double()`` move
        # and cast it together with the parameters; a plain tensor attribute
        # would stay on the CPU and break the division in ``forward`` on GPU.
        # ``persistent=False`` keeps it out of ``state_dict`` so the set of
        # checkpoint keys is unchanged.
        self.register_buffer(
            "scale",
            torch.sqrt(torch.tensor([self.head_dim], dtype=torch.float32)),
            persistent=False,
        )

    def forward(self, query, key, value, mask=None):
        """Compute attention of ``query`` over ``key``/``value``.

        Args:
            query: Tensor whose last dimension is ``hid_dim``; reshaped to
                (batch, q_len, n_heads, head_dim) internally.
            key: Tensor with the same batch size and last dimension ``hid_dim``.
            value: Tensor with the same layout as ``key``.
            mask: Optional tensor broadcastable to the score shape
                (batch, n_heads, q_len, k_len). Positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape (batch, q_len, hid_dim).
        """
        bsz = query.shape[0]

        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)

        # (batch, seq, hid_dim) -> (batch, n_heads, seq, head_dim)
        Q = Q.view(bsz, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(bsz, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(bsz, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Scaled dot-product scores: (batch, n_heads, q_len, k_len).
        attention = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        if mask is not None:
            # A very negative score turns into ~0 weight after the softmax.
            attention = attention.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(attention, dim=-1)

        x = torch.matmul(attention, V)

        # Merge the heads back: (batch, q_len, n_heads * head_dim).
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(bsz, -1, self.n_heads * self.head_dim)

        x = self.fc(x)
        return x
2.前馈传播
class Feedforward(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Size of the input and output features.
        d_ff: Size of the hidden layer between the two linear maps.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        # Expand to d_ff, regularize, then project back to d_model.
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the block to ``x`` (last dimension ``d_model``) and return the result."""
        hidden = self.dropout(torch.relu(self.linear1(x)))
        return self.linear2(hidden)
3.位置编码
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The encoding table is built once for ``max_len`` positions and stored as
    the buffer ``pe`` of shape (1, max_len, d_model). Even feature indices
    hold sines and odd indices hold cosines.

    Args:
        d_model: Feature size of the embeddings (even or odd).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Build the table on the default device. As a registered buffer it
        # follows the module through ``.to(...)``, so it always ends up on the
        # same device as the embeddings it is added to.
        pe = torch.zeros(max_len, d_model)
        # Column vector of positions [[0], [1], [2], ...].
        position = torch.arange(0, max_len).unsqueeze(1)
        # 1 / 10000^(2i / d_model), computed through exp/log.
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the cosine term to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch dimension of size 1 so the table broadcasts over batches.
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])``.

        Args:
            x: Embedded inputs; ``x.size(1)`` is used as the sequence length
                and the last dimension must be ``d_model``, e.g. (1, 7, 128)
                for a batch of one 7-token sequence with 128-dim embeddings.
        """
        # Buffers carry no gradient, so the slice can be added directly.
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)
4.编码层
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalized.

    Args:
        d_model: Feature size of the layer inputs and outputs.
        n_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability used on both sub-layer outputs.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.feedforward = Feedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the layer on ``x``; ``mask`` is forwarded to the self-attention."""
        # Self-attention sub-layer: residual add, then LayerNorm.
        attended = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))

        # Feed-forward sub-layer: residual add, then LayerNorm.
        transformed = self.feedforward(x)
        return self.norm2(x + self.dropout(transformed))
5.解码层
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward.

    Each of the three sub-layer outputs goes through dropout, is added to
    its input (residual connection) and is then layer-normalized.

    Args:
        d_model: Feature size of the layer inputs and outputs.
        n_heads: Number of attention heads in both attention sub-layers.
        d_ff: Hidden size of the feed-forward sub-layer.
        dropout: Dropout probability used on every sub-layer output.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.enc_attn = MultiHeadAttention(d_model, n_heads)
        self.feedforward = Feedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, self_mask, context_mask):
        """Run the layer.

        Args:
            x: Decoder-side input.
            enc_output: Encoder output, used as key and value of the
                encoder-decoder attention.
            self_mask: Mask forwarded to the decoder self-attention.
            context_mask: Mask forwarded to the encoder-decoder attention.
        """
        # Self-attention over the decoder input.
        own_context = self.self_attn(x, x, x, self_mask)
        x = self.norm1(x + self.dropout(own_context))

        # Attention from the decoder states over the encoder output.
        enc_context = self.enc_attn(x, enc_output, enc_output, context_mask)
        x = self.norm2(x + self.dropout(enc_context))

        # Feed-forward sub-layer.
        transformed = self.feedforward(x)
        return self.norm3(x + self.dropout(transformed))
6.Transformer模型
class Transformer(nn.Module):
    """Encoder-decoder Transformer over a single shared vocabulary.

    Consists of one embedding table (used for both source and target), a
    positional encoding, a stack of encoder layers, a stack of decoder
    layers and a final linear projection to vocabulary size.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        d_model: Embedding / model feature size.
        n_heads: Number of attention heads per attention sub-layer.
        n_encoder_layers: Number of encoder layers.
        n_decoder_layers: Number of decoder layers.
        d_ff: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability passed to all sub-modules.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_encoder_layers, n_decoder_layers, d_ff, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)

        encoders = [
            EncoderLayer(d_model, n_heads, d_ff, dropout)
            for _ in range(n_encoder_layers)
        ]
        self.encoder_layers = nn.ModuleList(encoders)

        decoders = [
            DecoderLayer(d_model, n_heads, d_ff, dropout)
            for _ in range(n_decoder_layers)
        ]
        self.decoder_layers = nn.ModuleList(decoders)

        self.fc_out = nn.Linear(d_model, vocab_size)
        # Constructed here but not used in forward().
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, trg, src_mask, trg_mask):
        """Return output-layer scores for ``trg`` given ``src``.

        Args:
            src: Source token ids.
            trg: Target token ids.
            src_mask: Mask used by the encoder self-attention and by the
                decoder's encoder-decoder attention.
            trg_mask: Mask used by the decoder self-attention.

        Returns:
            Output of the final linear layer; its last dimension is ``vocab_size``.
        """
        # Embed and position-encode both sequences (source first, then target).
        memory = self.positional_encoding(self.embedding(src))
        decoded = self.positional_encoding(self.embedding(trg))

        # Encoder stack.
        for encoder in self.encoder_layers:
            memory = encoder(memory, src_mask)

        # Decoder stack; every layer attends over the final encoder output.
        for decoder in self.decoder_layers:
            decoded = decoder(decoded, memory, trg_mask, src_mask)

        # Project to vocabulary size.
        return self.fc_out(decoded)
# Example: build a model and run a single forward pass on random token ids.
vocab_size = 10000  # assumed vocabulary size
d_model = 512
n_heads = 8
n_encoder_layers = 6
n_decoder_layers = 6
d_ff = 2048
dropout = 0.1

transformer_model = Transformer(vocab_size, d_model, n_heads, n_encoder_layers, n_decoder_layers, d_ff, dropout)

# Random inputs for a smoke test; replace them with real token ids in practice.
src = torch.randint(0, vocab_size, (32, 10))  # source sequences of length 10
trg = torch.randint(0, vocab_size, (32, 20))  # target sequences of length 20

# Padding masks: token id 0 is treated as padding. Shape (batch, 1, 1, seq_len).
src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  # source mask
trg_pad_mask = (trg != 0).unsqueeze(1).unsqueeze(2)

# The decoder must not attend to later target positions, so the padding mask
# is combined with a lower-triangular (causal) mask. The result broadcasts to
# (batch, 1, trg_len, trg_len).
trg_len = trg.size(1)
causal_mask = torch.tril(torch.ones(trg_len, trg_len, device=trg.device)).bool()
trg_mask = trg_pad_mask & causal_mask  # target mask

# Forward pass; the last dimension of the output is vocab_size.
output = transformer_model(src, trg, src_mask, trg_mask)
print(output.shape)
7.小结
- Transformer直接基于attention构造
- Seq2Seq与Transformer的关系:传统的Seq2Seq模型在处理长序列时可能会遇到梯度消失/爆炸等问题,而Transformer模型的提出正是为了解决这些问题。Transformer的设计使其能够更好地处理长距离依赖关系,同时具有更好的并行性,因此在处理序列数据时取得了很大的成功。Transformer是一种可以用于Seq2Seq任务的特定类型的模型。