PyTorch Deep Learning in Practice | Implementing the Transformer Model

🌞 Welcome to the world of hands-on deep learning
🌈 Blog homepage: 卿云阁

💌 Feel free to follow 🎉 like 👍 bookmark ⭐️ and leave a comment 📝

🌟 This post is an original work by 卿云阁!

📆 First published: 🌹 February 8, 2025 🌹

✉️ I hope we can make progress on this journey together!

🙏 The author's knowledge is limited; if you spot any mistakes, please leave a comment. Many thanks!


Contents

Positional Encoding and Word Embeddings

Multi-Head Attention

Encoder

Decoder

Overall Transformer Architecture

Complete Code

Positional Encoding and Word Embeddings

    Self-attention by itself is insensitive to token order, so positional parameters that encode where each token sits (and hence the relative distance between tokens) are introduced before the attention weights are computed.

Absolute positional encoding based on trigonometric functions

    In the Transformer model, the representation fed into the network is obtained by adding the input word embedding (Embedding) and the positional encoding (Positional Encoding) element-wise.
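For reference, the sinusoidal encoding implemented below follows the standard formulation, where pos is the token position, i indexes the embedding dimensions, and d is the embedding size (emb_dim):

    PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d}}\right)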

import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Create a (max_len, emb_dim) matrix of positional encodings
        pe = torch.zeros(max_len, emb_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # position indices
        div_term = torch.exp(torch.arange(0, emb_dim, 2).float() * (-math.log(10000.0) / emb_dim))  # denominator term

        pe[:, 0::2] = torch.sin(position * div_term)  # sin on even-indexed dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # cos on odd-indexed dimensions

        pe = pe.unsqueeze(0)  # add a batch dimension
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]  # add positional information to the input
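
A minimal usage sketch (the batch size, sequence length, and embedding dimension below are arbitrary illustration values):

# Minimal smoke test with illustrative sizes
pos_enc = PositionalEncoding(emb_dim=16)
dummy = torch.zeros(2, 10, 16)    # (batch, seq_len, emb_dim)
out = pos_enc(dummy)
print(out.shape)  # torch.Size([2, 10, 16]); shape unchanged, position pattern added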

Multi-Head Attention

Each word vector X_i is split into m sub-vectors of length d/m, and scaled dot-product attention is computed independently on each sub-vector (one per attention head).

Q, K, V: X is first linearly projected into separate query, key, and value spaces; working in these learned sub-spaces lets each head concentrate on learning its own attention pattern instead of operating on the raw embeddings directly.

Masked attention

When decoding, the attention weight M_ij is set to 0 for every pair with i < j, so that query position i cannot look at future position j. In practice this is done by filling the corresponding scores with -inf before the softmax, as shown in the sketch below.
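As a standalone sketch (not part of the classes in this post), such a mask can be built from a lower-triangular matrix and applied to the scores before the softmax:

# Illustrative causal mask for a length-5 sequence
seq_len = 5
mask = torch.tril(torch.ones(seq_len, seq_len))        # 1 = visible, 0 = future position
scores = torch.randn(seq_len, seq_len)                 # dummy attention scores
masked = scores.masked_fill(mask == 0, float("-inf"))  # -inf becomes weight 0 after softmax
weights = torch.softmax(masked, dim=-1)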

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, emb_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert emb_dim % num_heads == 0  # emb_dim must be divisible by num_heads

        self.num_heads = num_heads
        self.head_dim = emb_dim // num_heads  # dimension of each head
        self.scaling = self.head_dim ** -0.5  # scaling factor 1 / sqrt(d_k)

        # Separate projections so the same module can be reused for cross-attention
        self.q_proj = nn.Linear(emb_dim, emb_dim)  # query projection
        self.k_proj = nn.Linear(emb_dim, emb_dim)  # key projection
        self.v_proj = nn.Linear(emb_dim, emb_dim)  # value projection
        self.fc_out = nn.Linear(emb_dim, emb_dim)  # output projection

    def forward(self, query, key=None, value=None, mask=None):
        # Self-attention by default; pass encoder outputs as key/value for cross-attention
        key = query if key is None else key
        value = query if value is None else value
        batch_size, q_len, emb_dim = query.shape
        k_len = key.size(1)

        # Project and split into heads: (batch, seq_len, num_heads, head_dim)
        q = self.q_proj(query).reshape(batch_size, q_len, self.num_heads, self.head_dim)
        k = self.k_proj(key).reshape(batch_size, k_len, self.num_heads, self.head_dim)
        v = self.v_proj(value).reshape(batch_size, k_len, self.num_heads, self.head_dim)

        # Attention scores: (batch, num_heads, q_len, k_len)
        attn_scores = torch.einsum("bqhd,bkhd->bhqk", q, k) * self.scaling
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float("-inf"))  # block masked positions
        attn_probs = torch.softmax(attn_scores, dim=-1)  # normalize to attention weights

        # Weighted sum of values, then merge the heads back to (batch, q_len, emb_dim)
        attn_output = torch.einsum("bhqk,bkhd->bqhd", attn_probs, v)
        attn_output = attn_output.reshape(batch_size, q_len, emb_dim)

        return self.fc_out(attn_output)  # final linear projection
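
A quick shape check for the module above (all sizes here are arbitrary example values):

# Self-attention: query, key and value all come from the same sequence
attn = MultiHeadSelfAttention(emb_dim=16, num_heads=2)
x = torch.randn(2, 10, 16)             # (batch, seq_len, emb_dim)
print(attn(x).shape)                   # torch.Size([2, 10, 16])

# Cross-attention: pass a separate key/value source (e.g. an encoder output)
memory = torch.randn(2, 7, 16)
print(attn(x, memory, memory).shape)   # torch.Size([2, 10, 16])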

Design of the non-linear sub-layer (residual connections and layer normalization)

Each sub-layer, whether attention or feed-forward, is wrapped in a residual connection followed by layer normalization, so the sub-layer output is LayerNorm(x + Sublayer(x)). The position-wise feed-forward network itself is two linear layers with a ReLU in between, and the result of the second residual + normalization step is the final output of the layer.
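Written out explicitly (the standard post-norm formulation):

    \text{SublayerOutput} = \mathrm{LayerNorm}\big(x + \mathrm{Sublayer}(x)\big), \qquad \mathrm{FFN}(x) = \max(0,\; xW_1 + b_1)\,W_2 + b_2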


Encoder

class EncoderLayer(nn.Module):
    def __init__(self, emb_dim, num_heads, ffn_dim):
        super(EncoderLayer, self).__init__()
        self.attn = MultiHeadSelfAttention(emb_dim, num_heads)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, ffn_dim),
            nn.ReLU(),
            nn.Linear(ffn_dim, emb_dim)
        )
        self.norm2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        x = self.norm1(x + self.attn(x))  # self-attention + residual + normalization
        x = self.norm2(x + self.ffn(x))   # feed-forward + residual + normalization
        return x
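
A minimal smoke test for a single encoder layer (sizes are arbitrary):

enc_layer = EncoderLayer(emb_dim=16, num_heads=2, ffn_dim=32)
x = torch.randn(2, 10, 16)
print(enc_layer(x).shape)  # torch.Size([2, 10, 16]); the residual design preserves the shape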

Decoder

class DecoderLayer(nn.Module):
    def __init__(self, emb_dim, num_heads, ffn_dim):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadSelfAttention(emb_dim, num_heads)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.cross_attn = MultiHeadSelfAttention(emb_dim, num_heads)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, ffn_dim),
            nn.ReLU(),
            nn.Linear(ffn_dim, emb_dim)
        )
        self.norm3 = nn.LayerNorm(emb_dim)

    def forward(self, x, enc_output):
        # Causal mask: query position i may only attend to positions j <= i
        seq_len = x.size(1)
        causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        x = self.norm1(x + self.self_attn(x, mask=causal_mask))  # masked self-attention + residual + norm
        x = self.norm2(x + self.cross_attn(x, enc_output, enc_output))  # attend to the encoder output
        x = self.norm3(x + self.ffn(x))  # feed-forward + residual + norm
        return x
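
And a corresponding smoke test for a single decoder layer (again with arbitrary example sizes, and a dummy tensor standing in for the encoder output):

dec_layer = DecoderLayer(emb_dim=16, num_heads=2, ffn_dim=32)
tgt = torch.randn(2, 6, 16)          # decoder input
memory = torch.randn(2, 10, 16)      # stand-in for the encoder output
print(dec_layer(tgt, memory).shape)  # torch.Size([2, 6, 16])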


Overall Transformer Architecture

class Transformer(nn.Module):
    def __init__(self, vocab_size, emb_dim, num_heads, ffn_dim, num_layers):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoder = PositionalEncoding(emb_dim)

        # ModuleList instead of nn.Sequential so decoder layers can take (x, enc_output)
        self.encoder = nn.ModuleList([EncoderLayer(emb_dim, num_heads, ffn_dim) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(emb_dim, num_heads, ffn_dim) for _ in range(num_layers)])

        self.fc_out = nn.Linear(emb_dim, vocab_size)

    def forward(self, src, tgt):
        src = self.pos_encoder(self.embedding(src))
        tgt = self.pos_encoder(self.embedding(tgt))

        enc_output = src
        for layer in self.encoder:
            enc_output = layer(enc_output)

        dec_output = tgt
        for layer in self.decoder:
            dec_output = layer(dec_output, enc_output)

        return self.fc_out(dec_output)


Complete Code

import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Create a (max_len, emb_dim) matrix of positional encodings
        pe = torch.zeros(max_len, emb_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # position indices
        div_term = torch.exp(torch.arange(0, emb_dim, 2).float() * (-math.log(10000.0) / emb_dim))  # denominator term

        pe[:, 0::2] = torch.sin(position * div_term)  # sin on even-indexed dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # cos on odd-indexed dimensions

        pe = pe.unsqueeze(0)  # add a batch dimension
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]  # add positional information to the input

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, emb_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert emb_dim % num_heads == 0  # emb_dim must be divisible by num_heads

        self.num_heads = num_heads
        self.head_dim = emb_dim // num_heads  # dimension of each head
        self.scaling = self.head_dim ** -0.5  # scaling factor 1 / sqrt(d_k)

        # Separate projections so the same module can be reused for cross-attention
        self.q_proj = nn.Linear(emb_dim, emb_dim)  # query projection
        self.k_proj = nn.Linear(emb_dim, emb_dim)  # key projection
        self.v_proj = nn.Linear(emb_dim, emb_dim)  # value projection
        self.fc_out = nn.Linear(emb_dim, emb_dim)  # output projection

    def forward(self, query, key=None, value=None, mask=None):
        # Self-attention by default; pass encoder outputs as key/value for cross-attention
        key = query if key is None else key
        value = query if value is None else value
        batch_size, q_len, emb_dim = query.shape
        k_len = key.size(1)

        # Project and split into heads: (batch, seq_len, num_heads, head_dim)
        q = self.q_proj(query).reshape(batch_size, q_len, self.num_heads, self.head_dim)
        k = self.k_proj(key).reshape(batch_size, k_len, self.num_heads, self.head_dim)
        v = self.v_proj(value).reshape(batch_size, k_len, self.num_heads, self.head_dim)

        # Attention scores: (batch, num_heads, q_len, k_len)
        attn_scores = torch.einsum("bqhd,bkhd->bhqk", q, k) * self.scaling
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float("-inf"))  # block masked positions
        attn_probs = torch.softmax(attn_scores, dim=-1)  # normalize to attention weights

        # Weighted sum of values, then merge the heads back to (batch, q_len, emb_dim)
        attn_output = torch.einsum("bhqk,bkhd->bqhd", attn_probs, v)
        attn_output = attn_output.reshape(batch_size, q_len, emb_dim)

        return self.fc_out(attn_output)  # final linear projection

class EncoderLayer(nn.Module):
    def __init__(self, emb_dim, num_heads, ffn_dim):
        super(EncoderLayer, self).__init__()
        self.attn = MultiHeadSelfAttention(emb_dim, num_heads)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, ffn_dim),
            nn.ReLU(),
            nn.Linear(ffn_dim, emb_dim)
        )
        self.norm2 = nn.LayerNorm(emb_dim)

    def forward(self, x):
        x = self.norm1(x + self.attn(x))  # self-attention + residual + normalization
        x = self.norm2(x + self.ffn(x))   # feed-forward + residual + normalization
        return x

class DecoderLayer(nn.Module):
    def __init__(self, emb_dim, num_heads, ffn_dim):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadSelfAttention(emb_dim, num_heads)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.cross_attn = MultiHeadSelfAttention(emb_dim, num_heads)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, ffn_dim),
            nn.ReLU(),
            nn.Linear(ffn_dim, emb_dim)
        )
        self.norm3 = nn.LayerNorm(emb_dim)

    def forward(self, x, enc_output):
        # Causal mask: query position i may only attend to positions j <= i
        seq_len = x.size(1)
        causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        x = self.norm1(x + self.self_attn(x, mask=causal_mask))  # masked self-attention + residual + norm
        x = self.norm2(x + self.cross_attn(x, enc_output, enc_output))  # attend to the encoder output
        x = self.norm3(x + self.ffn(x))  # feed-forward + residual + norm
        return x

class Transformer(nn.Module):
    def __init__(self, vocab_size, emb_dim, num_heads, ffn_dim, num_layers):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.pos_encoder = PositionalEncoding(emb_dim)

        # ModuleList instead of nn.Sequential so decoder layers can take (x, enc_output)
        self.encoder = nn.ModuleList([EncoderLayer(emb_dim, num_heads, ffn_dim) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([DecoderLayer(emb_dim, num_heads, ffn_dim) for _ in range(num_layers)])

        self.fc_out = nn.Linear(emb_dim, vocab_size)

    def forward(self, src, tgt):
        src = self.pos_encoder(self.embedding(src))
        tgt = self.pos_encoder(self.embedding(tgt))

        enc_output = src
        for layer in self.encoder:
            enc_output = layer(enc_output)

        dec_output = tgt
        for layer in self.decoder:
            dec_output = layer(dec_output, enc_output)

        return self.fc_out(dec_output)

# Transformer hyperparameters
vocab_size = 7    # assume the Chinese and English vocabularies are the same size
emb_dim = 16      # word-vector (embedding) dimension
num_heads = 2     # number of attention heads
ffn_dim = 32      # hidden size of the feed-forward network
num_layers = 2    # number of encoder & decoder layers


# Build the Transformer model
model = Transformer(vocab_size, emb_dim, num_heads, ffn_dim, num_layers)
print(model)
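
To see the model run end to end, a small forward pass can be added (a sketch; the token ids and sequence lengths below are made up):

# Dummy forward pass with random token ids
src = torch.randint(0, vocab_size, (2, 10))  # (batch, src_len)
tgt = torch.randint(0, vocab_size, (2, 6))   # (batch, tgt_len)
logits = model(src, tgt)
print(logits.shape)  # torch.Size([2, 6, 7]); per-position scores over the vocabulary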

