Reproducing the Transformer in PyTorch

Week TR3: Reproducing the Transformer in PyTorch


1. Multi-Head Attention


import math
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MultiHeadAttention(nn.Module):

    ## n_heads: number of attention heads
    ## hid_dim: dimension of the output vector of each token

    def __init__(self, hid_dim, n_heads):
        super(MultiHeadAttention, self).__init__()
        self.hid_dim = hid_dim
        self.n_heads = n_heads

        ## hid_dim must be divisible by n_heads
        assert hid_dim % n_heads == 0
        ## W_q projection matrix
        self.w_q = nn.Linear(hid_dim, hid_dim)
        ## W_k projection matrix
        self.w_k = nn.Linear(hid_dim, hid_dim)
        ## W_v projection matrix
        self.w_v = nn.Linear(hid_dim, hid_dim)
        self.fc = nn.Linear(hid_dim, hid_dim)
        ## scaling factor sqrt(d_k); a plain Python float avoids device mismatches
        self.scale = math.sqrt(hid_dim // n_heads)
        
    def forward(self, query, key, value, mask=None):
        ## Q may have a different sequence length than K and V; K and V must match.
        ## K: [64, 10, 300], batch_size 64, 10 tokens, each key vector is 300-dim
        ## V: [64, 10, 300], batch_size 64, 10 tokens, each value vector is 300-dim
        ## Q: [64, 12, 300], batch_size 64, 12 tokens, each query vector is 300-dim
        bsz = query.shape[0]     ## batch_size
        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)
        ## Split Q, K and V into multiple attention heads.
        ## The last dimension is self.hid_dim // self.n_heads, the vector length per head: 300 / 6 = 50.
        ## 64 is the batch_size, 6 the number of heads, 10 the number of tokens, 50 the per-head vector length.
        ## K: [64, 10, 300] -> split into heads [64, 10, 6, 50] -> transpose -> [64, 6, 10, 50]
        ## V: [64, 10, 300] -> split into heads [64, 10, 6, 50] -> transpose -> [64, 6, 10, 50]
        ## Q: [64, 12, 300] -> split into heads [64, 12, 6, 50] -> transpose -> [64, 6, 12, 50]
        ## The transpose moves the head dimension (6) forward so that the sequence length
        ## and the per-head size sit in the last two dimensions, ready for matmul.
        ## The -1 in view() lets PyTorch infer the sequence-length dimension.
        Q = Q.view(bsz, -1, self.n_heads, self.hid_dim //
                   self.n_heads).permute(0, 2, 1, 3)
        K = K.view(bsz, -1, self.n_heads, self.hid_dim //
                   self.n_heads).permute(0, 2, 1, 3)
        V = V.view(bsz, -1, self.n_heads, self.hid_dim //
                   self.n_heads).permute(0, 2, 1, 3)
        
        ## Step 1: multiply Q by the transpose of K and divide by the scale.
        ## [64, 6, 12, 50] * [64, 6, 50, 10] = [64, 6, 12, 10]
        ## attention: [64, 6, 12, 10]
        attention = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        ## If a mask is given, set the attention scores of masked (0) positions to -1e10
        ## so that they vanish after the softmax.
        if mask is not None:
            attention = attention.masked_fill(mask == 0, -1e10)

        ## Step 2: apply softmax to the scores to obtain the attention weights.
        ## dim=-1 means the softmax is taken over the last dimension,
        ## i.e. over the key/value positions of the input sequence.
        ## attention: [64, 6, 12, 10]
        attention = torch.softmax(attention, dim=-1)
        
        ## Step 3: multiply the attention weights by V to get the multi-head output.
        ## [64, 6, 12, 10] * [64, 6, 10, 50] = [64, 6, 12, 50]
        ## x: [64, 6, 12, 50]
        x = torch.matmul(attention, V)

        ## The query has 12 tokens, so move 12 forward and 6 and 50 back,
        ## which makes it easy to concatenate the results of all heads below.
        ## x: [64, 6, 12, 50] -> transpose -> [64, 12, 6, 50]
        x = x.permute(0, 2, 1, 3).contiguous()
        ## This reshape concatenates the outputs of the heads.
        ## x: [64, 12, 6, 50] -> [64, 12, 300]
        x = x.view(bsz, -1, self.n_heads * (self.hid_dim // self.n_heads))
        x = self.fc(x)
        return x
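A quick shape check using the hypothetical sizes from the comments above (hid_dim=300, 6 heads, 12 query tokens, 10 key/value tokens); the tensors here are random placeholders:

attn = MultiHeadAttention(hid_dim=300, n_heads=6)
q = torch.rand(64, 12, 300)    ## 12 query tokens
k = torch.rand(64, 10, 300)    ## 10 key/value tokens
v = torch.rand(64, 10, 300)
print(attn(q, k, v).shape)     ## torch.Size([64, 12, 300])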

2. Feed-Forward Network

class Feedforward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(Feedforward, self).__init__()
        ## Two linear layers with a ReLU activation in between
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)
        
    def forward(self, x):
        x = torch.nn.functional.relu(self.linear1(x))
        x = self.dropout(x)
        x = self.linear2(x)
        return x
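A one-line sanity check (d_model=512 and d_ff=2048 here mirror the values used in the final example below; the input tensor is a random placeholder):

ff = Feedforward(d_model=512, d_ff=2048)
x = torch.rand(64, 12, 512)
print(ff(x).shape)    ## torch.Size([64, 12, 512]): the shape is unchanged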

3. Positional Encoding

class PositionalEncoding(nn.Module):
    "Positional encoding implementation."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        ## Initialize the PE (positional encoding) tensor with shape (max_len, d_model).
        ## It is registered as a buffer below, so it moves to the right device with the module.
        pe = torch.zeros(max_len, d_model)

        ## Positions 0, 1, 2, ... as a column vector of shape (max_len, 1)
        position = torch.arange(0, max_len).unsqueeze(1)
        ## The term inside sin/cos, 1 / 10000^(2i / d_model), computed via exp and log
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)  ## PE(pos, 2i)
        pe[:, 1::2] = torch.cos(position * div_term)  ## PE(pos, 2i+1)

        pe = pe.unsqueeze(0)   ## add a batch dimension at the front for broadcasting

        ## register_buffer stores a tensor that is not updated by gradient descent
        ## but should still be saved (and moved across devices) with the model.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        x is the embedded input, e.g. shape (1, 7, 128):
        batch size 1, 7 tokens, each token embedded in 128 dimensions.
        """

        ## Add the positional encoding to x
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        return self.dropout(x)
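A short usage sketch matching the docstring example (batch size 1, 7 tokens, 128-dim embeddings; dropout=0.1 is an assumed value):

pos_enc = PositionalEncoding(d_model=128, dropout=0.1)
x = torch.rand(1, 7, 128)      ## embedded inputs
print(pos_enc(x).shape)        ## torch.Size([1, 7, 128])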

4. Encoder Layer

class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        ## The encoder layer consists of self-attention and a feed-forward network
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.feedforward = Feedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        ## Self-attention
        attn_output = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)
        
        ## Feed-forward network
        ff_output = self.feedforward(x)
        x = x + self.dropout(ff_output)
        x = self.norm2(x)
        
        return x
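An illustrative check that a single encoder layer preserves the input shape (mask=None means no padding positions are masked out):

enc_layer = EncoderLayer(d_model=512, n_heads=8, d_ff=2048)
x = torch.rand(32, 10, 512)
print(enc_layer(x, mask=None).shape)    ## torch.Size([32, 10, 512])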

5. Decoder Layer

class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        ## The decoder layer consists of self-attention, encoder-decoder attention and a feed-forward network
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.enc_attn = MultiHeadAttention(d_model, n_heads)
        self.feedforward = Feedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, x, enc_output, self_mask, context_mask):
        ## Self-attention
        attn_output = self.self_attn(x, x, x, self_mask)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)
        
        ## Encoder-decoder attention
        attn_output = self.enc_attn(x, enc_output, enc_output, context_mask)
        x = x + self.dropout(attn_output)
        x = self.norm2(x)
                
        ## Feed-forward network
        ff_output = self.feedforward(x)
        x = x + self.dropout(ff_output)
        x = self.norm3(x)
        
        return x
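The masks built in the usage example below only hide padded positions. For the decoder's self-attention one usually also blocks attention to future tokens; here is a minimal sketch of such a combined mask (the helper name make_trg_mask and pad_idx=0 are assumptions, not part of the example below):

def make_trg_mask(trg, pad_idx=0):
    ## padding mask: [batch, 1, 1, trg_len]
    pad_mask = (trg != pad_idx).unsqueeze(1).unsqueeze(2)
    ## look-ahead mask: [trg_len, trg_len], lower triangular
    trg_len = trg.shape[1]
    look_ahead = torch.tril(torch.ones(trg_len, trg_len, dtype=torch.bool, device=trg.device))
    ## combined mask: [batch, 1, trg_len, trg_len], broadcasts over the head dimension
    return pad_mask & look_ahead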

6. Building the Transformer Model

class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_encoder_layers, n_decoder_layers, d_ff, dropout=0.1):
        super(Transformer, self).__init__()
        ## The Transformer consists of token embeddings, positional encoding, an encoder stack and a decoder stack
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_encoder_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_decoder_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, trg, src_mask, trg_mask):
        ## Token embedding and positional encoding
        src = self.embedding(src)
        src = self.positional_encoding(src)
        trg = self.embedding(trg)
        trg = self.positional_encoding(trg)
        
        ## Encoder
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
            
        ## Decoder
        for layer in self.decoder_layers:
            trg = layer(trg, src, trg_mask, src_mask)
            
        ## Output projection
        output = self.fc_out(trg)
        
        return output
## Usage example
vocab_size = 10000      ## assume a vocabulary size of 10000
d_model = 512
n_heads = 8
n_encoder_layers = 6
n_decoder_layers = 6
d_ff = 2048
dropout = 0.1

transformer_model = Transformer(vocab_size, d_model, n_heads, n_encoder_layers, n_decoder_layers, d_ff, dropout).to(device)
## Define example inputs; these are dummy tensors and should be replaced with real data
src = torch.randint(0, vocab_size, (32, 10)).to(device)   ## source-language sentences
trg = torch.randint(0, vocab_size, (32, 20)).to(device)   ## target-language sentences
src_mask = (src != 0).unsqueeze(1).unsqueeze(2)  ## mask that hides padded positions
trg_mask = (trg != 0).unsqueeze(1).unsqueeze(2)  ## padding mask only; see the look-ahead mask sketch above

## Forward pass
output = transformer_model(src, trg, src_mask, trg_mask)
print(output.shape)

torch.Size([32, 20, 10000])
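To show how this [32, 20, 10000] output would typically be consumed during training, here is a hedged sketch using nn.CrossEntropyLoss (treating 0 as the padding index is an assumption, and in a real setup the decoder input and the target would be shifted by one position):

criterion = nn.CrossEntropyLoss(ignore_index=0)
## logits: [32, 20, 10000] -> [32*20, 10000]; targets: [32, 20] -> [32*20]
loss = criterion(output.view(-1, vocab_size), trg.view(-1))
print(loss.item())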

Summary

This week I studied the Transformer and, by working through the code, gained a full picture of the model, with a particular focus on its self-attention mechanism.
