Transformer 的 PyTorch 实现

本文主要介绍一下如何使用 PyTorch 复现 Transformer,实现简单的机器翻译任务。

数据预处理

这里我并没有用什么大型的数据集,而是手动输入了两对德语→英语的句子,还有每个字的索引也是我手动硬编码上去的,主要是为了降低代码阅读难度,我希望读者能更关注模型实现的部分

import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data

# S: symbol marking the start of the decoder input
# E: symbol marking the end of the decoder output sequence
# P: padding symbol used to fill a sequence that is shorter than the fixed time steps
sentences = [
        # enc_input           dec_input         dec_output
        ['ich mochte ein bier P', 'S i want a beer .', 'i want a beer . E'],
        ['ich mochte ein cola P', 'S i want a coke .', 'i want a coke . E']
]

# Padding must map to index 0 (the attention masks and the loss's ignore_index rely on this).
src_vocab = {'P' : 0, 'ich' : 1, 'mochte' : 2, 'ein' : 3, 'bier' : 4, 'cola' : 5}
src_vocab_size = len(src_vocab)

tgt_vocab = {'P' : 0, 'i' : 1, 'want' : 2, 'a' : 3, 'beer' : 4, 'coke' : 5, 'S' : 6, 'E' : 7, '.' : 8}
idx2word = {i: w for i, w in enumerate(tgt_vocab)}  # reverse lookup: index -> word
tgt_vocab_size = len(tgt_vocab)

src_len = 5 # enc_input max sequence length
tgt_len = 6 # dec_input(=dec_output) max sequence length

def make_data(sentences):
    """Turn the raw [enc, dec_in, dec_out] string triples into index tensors.

    Returns three LongTensors:
    enc_inputs [n, src_len], dec_inputs [n, tgt_len], dec_outputs [n, tgt_len].
    """
    enc_inputs, dec_inputs, dec_outputs = [], [], []
    for enc_sent, dec_in_sent, dec_out_sent in sentences:
        enc_inputs.append([src_vocab[w] for w in enc_sent.split()])    # e.g. [1, 2, 3, 4, 0]
        dec_inputs.append([tgt_vocab[w] for w in dec_in_sent.split()])  # e.g. [6, 1, 2, 3, 4, 8]
        dec_outputs.append([tgt_vocab[w] for w in dec_out_sent.split()])  # e.g. [1, 2, 3, 4, 8, 7]
    return torch.LongTensor(enc_inputs), torch.LongTensor(dec_inputs), torch.LongTensor(dec_outputs)

enc_inputs, dec_inputs, dec_outputs = make_data(sentences)

class MyDataSet(Data.Dataset):
    """Minimal Dataset pairing encoder inputs with decoder inputs and targets."""

    def __init__(self, enc_inputs, dec_inputs, dec_outputs):
        super(MyDataSet, self).__init__()
        self.enc_inputs = enc_inputs
        self.dec_inputs = dec_inputs
        self.dec_outputs = dec_outputs

    def __len__(self):
        # Number of sentence pairs.
        return self.enc_inputs.shape[0]

    def __getitem__(self, idx):
        # One (enc_input, dec_input, dec_output) triple.
        return self.enc_inputs[idx], self.dec_inputs[idx], self.dec_outputs[idx]

loader = Data.DataLoader(MyDataSet(enc_inputs, dec_inputs, dec_outputs), 2, True)

模型参数

# Transformer Parameters
d_model = 512  # Embedding Size (token / positional embedding dimension)
d_ff = 2048 # FeedForward hidden dimension
d_k = d_v = 64  # per-head dimension of Q(=K) and of V
n_layers = 6  # number of Encoder and Decoder layers
n_heads = 8  # number of heads in Multi-Head Attention

上面都比较简单,下面开始涉及到模型就比较复杂了,因此我会将模型拆分成以下几个部分进行讲解

  • Positional Encoding
  • Pad Mask(针对句子不够长,加了 pad,因此需要对 pad 进行 mask)
  • Subsequence Mask(Decoder input 不能看到未来时刻单词信息,因此需要 mask)
  • ScaledDotProductAttention(计算 context vector)
  • Multi-Head Attention
  • FeedForward Layer
  • Encoder Layer
  • Encoder
  • Decoder Layer
  • Decoder
  • Transformer

Positional Encoding

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    Input and output are shaped [seq_len, batch_size, d_model].
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
        # Geometric frequency ladder: 10000^(-2i/d_model) for each even index 2i.
        freqs = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(positions * freqs)  # even dims: sine
        table[:, 1::2] = torch.cos(positions * freqs)  # odd dims: cosine
        # Reshape to [max_len, 1, d_model] so it broadcasts over the batch axis.
        self.register_buffer('pe', table.unsqueeze(0).transpose(0, 1))

    def forward(self, x):
        # x: [seq_len, batch_size, d_model]; add the first seq_len position rows.
        return self.dropout(x + self.pe[:x.size(0), :])

Pad Mask

def get_attn_pad_mask(seq_q, seq_k):
    """Build an attention mask that hides PAD positions of the key sequence.

    seq_q: [batch_size, len_q]
    seq_k: [batch_size, len_k]
    len_q and len_k may differ (e.g. decoder-to-encoder attention).
    Returns a bool tensor [batch_size, len_q, len_k]: True where the key token
    is PAD (index 0); True positions are the ones masked out by the attention.
    """
    batch_size = seq_q.size(0)
    len_q = seq_q.size(1)
    len_k = seq_k.size(1)
    # Token index 0 is the PAD symbol.
    pad_positions = seq_k.data.eq(0).unsqueeze(1)  # [batch_size, 1, len_k]
    # Broadcast the same key mask across every query position.
    return pad_positions.expand(batch_size, len_q, len_k)

Subsequence Mask

def get_attn_subsequence_mask(seq):
    """Causal mask for decoder self-attention.

    seq: [batch_size, tgt_len]
    Returns a byte tensor [batch_size, tgt_len, tgt_len] whose strictly-upper
    triangle is 1 — those are the future positions each step may not attend to.
    """
    batch_size, seq_len = seq.size(0), seq.size(1)
    ones = torch.ones(batch_size, seq_len, seq_len)
    # Keep only entries strictly above the diagonal.
    return torch.triu(ones, diagonal=1).byte()

Subsequence Mask 只有 Decoder 会用到,主要作用是屏蔽未来时刻单词的信息。首先通过 np.ones()生成一个全 1 的方阵,然后通过 np.triu()生成一个上三角矩阵

ScaledDotProductAttention

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(QK^T / sqrt(d_k)) V."""

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        '''
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, len_q, len_k], True positions are masked
        Returns (context [batch_size, n_heads, len_q, d_v],
                 attn [batch_size, n_heads, len_q, len_k]).
        '''
        # Generalization: take the head dimension from Q itself instead of the
        # module-level global d_k, so the layer works for any head size.
        head_dim = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(head_dim)  # [batch_size, n_heads, len_q, len_k]
        # Large negative value so masked positions get ~0 weight after softmax.
        scores.masked_fill_(attn_mask, -1e9)

        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)  # [batch_size, n_heads, len_q, d_v]
        return context, attn

这里要做的是,通过 Q 和 K 计算出 scores,然后将 scores 和 V 相乘,得到每个单词的 context vector

class MultiHeadAttention(nn.Module):
    """Multi-head attention with residual connection and LayerNorm.

    Relies on the module-level hyperparameters d_model, d_k, d_v, n_heads.
    """

    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model, bias=False)
        # FIX: the original built nn.LayerNorm(d_model).cuda() inside forward(),
        # re-initializing its affine parameters on every call (they were never
        # registered with the model, hence never trained) and hard-coding CUDA,
        # which broke CPU-only execution. As a registered submodule it is created
        # once and moves automatically with model.cuda()/.to().
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input_Q, input_K, input_V, attn_mask):
        '''
        input_Q: [batch_size, len_q, d_model]
        input_K: [batch_size, len_k, d_model]
        input_V: [batch_size, len_v(=len_k), d_model]
        attn_mask: [batch_size, seq_len, seq_len], True positions are masked
        Returns (output [batch_size, len_q, d_model], attn [batch_size, n_heads, len_q, len_k]).
        '''
        residual, batch_size = input_Q, input_Q.size(0)
        # (B, S, D) -proj-> (B, S, D_new) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size, n_heads, len_q, d_k]
        K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)  # [batch_size, n_heads, len_k, d_k]
        V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)  # [batch_size, n_heads, len_v, d_v]

        # Replicate the mask for every attention head.
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)  # [batch_size, n_heads, seq_len, seq_len]

        # context: [batch_size, n_heads, len_q, d_v], attn: [batch_size, n_heads, len_q, len_k]
        context, attn = ScaledDotProductAttention()(Q, K, V, attn_mask)
        # Merge heads back together: [batch_size, len_q, n_heads * d_v]
        context = context.transpose(1, 2).reshape(batch_size, -1, n_heads * d_v)
        output = self.fc(context)  # [batch_size, len_q, d_model]
        return self.layer_norm(output + residual), attn

完整代码中一定会有三处地方调用 MultiHeadAttention(),Encoder Layer 调用一次,传入的 input_Q、input_K、input_V 全部都是 enc_inputs;Decoder Layer 中两次调用,第一次传入的全是 dec_inputs,第二次传入的分别是 dec_outputs,enc_outputs,enc_outputs

FeedForward Layer

class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward net: two linear layers with a ReLU in between,
    followed by a residual connection and LayerNorm.

    Relies on the module-level hyperparameters d_model and d_ff.
    """

    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_ff, bias=False),
            nn.ReLU(),
            nn.Linear(d_ff, d_model, bias=False)
        )
        # FIX: the original created nn.LayerNorm(d_model).cuda() on every forward
        # call, so its parameters were never registered or trained and the
        # hard-coded .cuda() broke CPU runs. Registered once here, it follows
        # model.cuda()/.to() automatically.
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, inputs):
        '''
        inputs: [batch_size, seq_len, d_model]
        Returns [batch_size, seq_len, d_model].
        '''
        residual = inputs
        output = self.fc(inputs)
        return self.layer_norm(output + residual)

这段代码非常简单,就是做两次线性变换,残差连接后再跟一个 Layer Norm

Encoder Layer

class EncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a position-wise FFN."""

    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        """
        enc_inputs: [batch_size, src_len, d_model]
        enc_self_attn_mask: [batch_size, src_len, src_len]
        Returns (enc_outputs [batch_size, src_len, d_model],
                 attn [batch_size, n_heads, src_len, src_len]).
        """
        # Self-attention: Q, K and V all come from the same encoder input.
        attended, attn_weights = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        return self.pos_ffn(attended), attn_weights

Encoder

class Encoder(nn.Module):
    """Token embedding + positional encoding + a stack of n_layers encoder blocks."""

    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

    def forward(self, enc_inputs):
        """
        enc_inputs: [batch_size, src_len] token indices
        Returns (enc_outputs [batch_size, src_len, d_model],
                 list of per-layer attention maps [batch_size, n_heads, src_len, src_len]).
        """
        hidden = self.src_emb(enc_inputs)  # [batch_size, src_len, d_model]
        # PositionalEncoding expects [seq_len, batch, d_model], so transpose around it.
        hidden = self.pos_emb(hidden.transpose(0, 1)).transpose(0, 1)
        self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs)  # [batch_size, src_len, src_len]
        attn_maps = []
        # Each layer keeps [batch_size, src_len, d_model], so outputs chain directly.
        for layer in self.layers:
            hidden, layer_attn = layer(hidden, self_attn_mask)
            attn_maps.append(layer_attn)
        return hidden, attn_maps

使用 nn.ModuleList(),它的参数是一个列表,列表里面存了 n_layers 个 Encoder Layer

由于我们控制好了 Encoder Layer 的输入和输出维度相同,所以可以直接用个 for 循环以嵌套的方式,将上一次 Encoder Layer 的输出作为下一次 Encoder Layer 的输入

Decoder Layer

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention, then FFN."""

    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention()
        self.dec_enc_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        """
        dec_inputs: [batch_size, tgt_len, d_model]
        enc_outputs: [batch_size, src_len, d_model]
        dec_self_attn_mask: [batch_size, tgt_len, tgt_len]
        dec_enc_attn_mask: [batch_size, tgt_len, src_len]
        Returns (dec_outputs [batch_size, tgt_len, d_model], self-attn map, cross-attn map).
        """
        # Masked self-attention over the decoder input.
        hidden, self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        # Cross-attention: queries from the decoder, keys/values from the encoder output.
        hidden, cross_attn = self.dec_enc_attn(hidden, enc_outputs, enc_outputs, dec_enc_attn_mask)
        return self.pos_ffn(hidden), self_attn, cross_attn

在 Decoder Layer 中会调用两次 MultiHeadAttention,第一次是计算 Decoder Input 的 self-attention,得到输出 dec_outputs。然后将 dec_outputs 作为生成 Q 的元素,enc_outputs 作为生成 K 和 V 的元素,再调用一次 MultiHeadAttention,得到的是 Encoder 和 Decoder Layer 之间的 context vector。最后将 dec_outputs 做一次维度变换,然后返回

Decoder

class Decoder(nn.Module):
    """Target embedding + positional encoding + a stack of n_layers decoder blocks."""

    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs, enc_inputs, enc_outputs):
        """
        dec_inputs: [batch_size, tgt_len]
        enc_inputs: [batch_size, src_len]
        enc_outputs: [batch_size, src_len, d_model]
        Returns (dec_outputs [batch_size, tgt_len, d_model],
                 per-layer self-attn maps, per-layer cross-attn maps).
        """
        hidden = self.tgt_emb(dec_inputs)  # [batch_size, tgt_len, d_model]
        hidden = self.pos_emb(hidden.transpose(0, 1)).transpose(0, 1).cuda()
        # Self-attention mask: hide PAD positions AND future tokens. Any position
        # flagged by either mask (sum > 0) is masked.
        pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs).cuda()  # [batch_size, tgt_len, tgt_len]
        causal_mask = get_attn_subsequence_mask(dec_inputs).cuda()  # [batch_size, tgt_len, tgt_len]
        self_attn_mask = torch.gt((pad_mask + causal_mask), 0).cuda()
        # Cross-attention mask: hide PAD positions of the encoder input.
        cross_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs)  # [batch_size, tgt_len, src_len]

        self_attn_maps, cross_attn_maps = [], []
        for layer in self.layers:
            hidden, layer_self_attn, layer_cross_attn = layer(hidden, enc_outputs, self_attn_mask, cross_attn_mask)
            self_attn_maps.append(layer_self_attn)
            cross_attn_maps.append(layer_cross_attn)
        return hidden, self_attn_maps, cross_attn_maps

Decoder 中不仅要把 "pad"mask 掉,还要 mask 未来时刻的信息,因此就有了下面这三行代码,其中 torch.gt(a, value) 的意思是,将 a 中各个位置上的元素和 value 比较,若大于 value,则该位置取 1,否则取 0

dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs) # [batch_size, tgt_len, tgt_len]
        dec_self_attn_subsequence_mask = get_attn_subsequence_mask(dec_inputs) # [batch_size, tgt_len, tgt_len]
        dec_self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequence_mask), 0) # [batch_size, tgt_len, tgt_len]

Transformer

class Transformer(nn.Module):
    """Full encoder-decoder Transformer with a projection onto the target vocabulary."""

    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder = Encoder().cuda()
        self.decoder = Decoder().cuda()
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False).cuda()

    def forward(self, enc_inputs, dec_inputs):
        """
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, tgt_len]
        Returns flattened logits [batch_size * tgt_len, tgt_vocab_size]
        plus all encoder/decoder attention maps for inspection.
        """
        # enc_outputs: [batch_size, src_len, d_model]
        enc_outputs, enc_self_attns = self.encoder(enc_inputs)
        # dec_outputs: [batch_size, tgt_len, d_model]
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs)
        logits = self.projection(dec_outputs)  # [batch_size, tgt_len, tgt_vocab_size]
        # Flatten so CrossEntropyLoss can consume (N, C) directly.
        return logits.view(-1, logits.size(-1)), enc_self_attns, dec_self_attns, dec_enc_attns

Transformer 主要就是调用 Encoder 和 Decoder。最后返回 dec_logits 的维度是 [batch_size * tgt_len, tgt_vocab_size],可以理解为,一个句子,这个句子有 batch_size*tgt_len 个单词,每个单词有 tgt_vocab_size种情况,取概率最大者

模型 & 损失函数 & 优化器

# Build the model, loss, and optimizer.
model = Transformer().cuda()
# ignore_index=0 skips target positions holding 'P' (PAD, index 0) when computing the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Plain SGD with high momentum is sufficient for this two-sentence toy task.
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)

这里的损失函数里面我设置了一个参数 ignore_index=0,因为 “pad” 这个单词的索引为 0,这样设置以后,就不会计算 “pad” 的损失(因为本来 “pad” 也没有意义,不需要计算)

训练

# Training loop: 30 epochs of full-batch updates on the toy dataset.
for epoch in range(30):
    for enc_inputs, dec_inputs, dec_outputs in loader:
      '''
      enc_inputs: [batch_size, src_len]
      dec_inputs: [batch_size, tgt_len]
      dec_outputs: [batch_size, tgt_len]
      '''
      # Move the batch to the GPU, where the model lives.
      enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(), dec_inputs.cuda(), dec_outputs.cuda()
      # outputs: [batch_size * tgt_len, tgt_vocab_size]
      outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
      # Flatten targets to match the flattened logits; ignore_index=0 skips PAD.
      loss = criterion(outputs, dec_outputs.view(-1))
      print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

测试

def greedy_decoder(model, enc_input, start_symbol):
    """
    Greedy decoding (equivalent to beam search with K=1). Needed at inference
    time because the target sequence is unknown: the target input is generated
    word by word and fed back into the transformer at every step.
    Starting Reference: http://nlp.seas.harvard.edu/2018/04/03/attention.html#greedy-decoding
    :param model: Transformer Model
    :param enc_input: The encoder input, shape [1, src_len]
    :param start_symbol: Index of the start symbol; here 'S', which is index 6 in tgt_vocab
    :return: The generated decoder input, shape [1, tgt_len]
    """
    enc_outputs, enc_self_attns = model.encoder(enc_input)
    # Decoder input starts as all zeros (PAD) and is filled one position per step.
    dec_input = torch.zeros(1, tgt_len).type_as(enc_input.data)
    next_symbol = start_symbol
    for i in range(0, tgt_len):
        dec_input[0][i] = next_symbol
        dec_outputs, _, _ = model.decoder(dec_input, enc_input, enc_outputs)
        projected = model.projection(dec_outputs)
        # argmax over the vocabulary at every position; keep the prediction at step i.
        prob = projected.squeeze(0).max(dim=-1, keepdim=False)[1]
        next_word = prob.data[i]
        next_symbol = next_word.item()
    return dec_input

# Test: greedily decode the first sentence of a batch, then score it with the model.
enc_inputs, _, _ = next(iter(loader))
greedy_dec_input = greedy_decoder(model, enc_inputs[0].view(1, -1).cuda(), start_symbol=tgt_vocab["S"])
predict, _, _, _ = model(enc_inputs[0].view(1, -1).cuda(), greedy_dec_input)
# predict: [tgt_len, tgt_vocab_size] -> pick the most likely word index per position.
predict = predict.data.max(1, keepdim=True)[1]
print(enc_inputs[0], '->', [idx2word[n.item()] for n in predict.squeeze()])

参考文章:
https://wmathor.com/index.php/archives/1455/

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
### 回答1: Transformer是一种基于自注意力机制的神经网络模型,用于处理序列到序列的任务,如机器翻译、文本摘要等。PyTorch是一个流行的深度学习框架,提供了实现Transformer模型的工具和库。使用PyTorch实现Transformer模型可以方便地进行模型训练和调试,并且可以利用PyTorch的自动求导功能来优化模型参数。 ### 回答2: Transformer是一种用于序列建模的深度学习模型,它可以用于自然语言处理中的机器翻译、文本分类、语言模型等任务。它的设计思路是利用注意力机制来捕捉输入序列之间的关系。 PyTorch是一种基于Python的优秀的深度学习框架。在PyTorch中,可以使用预定义的模型类来实现Transformer模型。Transformer模型在PyTorch框架中实现的方法主要分为两种:自定义层和PyTorch自带模块。 自定义层 在PyTorch中,借助于nn.Module和nn.Parameter类,可以轻松地定义自己的模型层。下面是一个例子: ``` import torch import torch.nn as nn import torch.nn.functional as F class MultiHeadAttention(nn.Module): def __init__(self, d_model, heads): super().__init__() self.d_model = d_model self.heads = heads assert d_model % heads == 0 self.d_k = d_model // heads self.q_linear = nn.Linear(d_model, d_model) self.v_linear = nn.Linear(d_model, d_model) self.k_linear = nn.Linear(d_model, d_model) self.out = nn.Linear(d_model, d_model) def forward(self, q, k, v, mask=None): bs = q.size(0) q = self.q_linear(q).view(bs, -1, self.heads, self.d_k) k = self.k_linear(k).view(bs, -1, self.heads, self.d_k) v = self.v_linear(v).view(bs, -1, self.heads, self.d_k) q = q.permute(0, 2, 1, 3) k = k.permute(0, 2, 1, 3) v = v.permute(0, 2, 1, 3) scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32)) if mask is not None: mask = mask.unsqueeze(1).repeat(1, self.heads, 1, 1) scores = scores.masked_fill(mask == 0, -1e9) scores = F.softmax(scores, dim=-1) attention = torch.matmul(scores, v) attention = attention.permute(0, 2, 1, 3).contiguous() attention = attention.view(bs, -1, self.heads * self.d_k) return self.out(attention) ``` 此处定义了一个MultiHeadAttention类,并在初始化函数中定义各个线性层,而forward函数则为模型的前向传递代码。 其中,MultiHeadAttention中的q、k、v分别表示查询、键和值的输入张量,mask为特殊的掩码,用于限制注意力机制只看前面的信息。在forward函数中,我们首先把输入张量传递到各自的线性层中,然后按照头数分割,为每个头初始化查询、键和值(使用view函数),然后使用softmax归一化注意力分布,最后用权重矩阵与值矩阵的乘积形成输出。最后我们将头合并,返回输出张量。 这样,我们就可以通过自定义层的方式来定义Transformer模型。需要注意的是,在整个模型中,每一个自定义层应该加一次Layer Normalization。 使用PyTorch自带模块 
除了使用自定义层,PyTorch还提供了一些预定义的模块类,用于模型的构建。下面是一个使用PyTorch自带模块搭建的Transformer模型: ``` import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable class MultiHeadAttention(nn.Module): def __init__(self, d_model, heads): super().__init__() self.d_model = d_model self.heads = heads assert d_model % heads == 0 self.d_k = d_model // heads self.qkv = nn.Linear(d_model, 3 * d_model) self.out = nn.Linear(d_model, d_model) def forward(self, q, k, v, mask=None): bs = q.size(0) qkv = self.qkv(torch.cat([q, k, v], dim=-1)) qkv = qkv.view(bs, -1, self.heads, 3 * self.d_k).transpose(1, 2) q, k, v = qkv[:, :, :, :self.d_k], qkv[:, :, :, self.d_k:2*self.d_k], qkv[:, :, :, 2*self.d_k:] scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32)) if mask is not None: mask = mask.unsqueeze(1).repeat(1, self.heads, 1, 1) scores = scores.masked_fill(mask == 0, -1e9) scores = F.softmax(scores, dim=-1) attention = torch.matmul(scores, v) attention = attention.transpose(1, 2).contiguous().view(bs, -1, self.heads * self.d_k) return self.out(attention) class PositionwiseFeedForward(nn.Module): def __init__(self, d_model, hidden_dim): super().__init__() self.fc1 = nn.Linear(d_model, hidden_dim) self.fc2 = nn.Linear(hidden_dim, d_model) def forward(self, x): return self.fc2(F.relu(self.fc1(x))) class Normalization(nn.Module): def __init__(self, d_model): super().__init__() self.d_model = d_model self.alpha = nn.Parameter(torch.ones(self.d_model)) self.bias = nn.Parameter(torch.zeros(self.d_model)) def forward(self, x): norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) / (x.std(dim=-1, keepdim=True) + 1e-6) + self.bias return norm class EncoderLayer(nn.Module): def __init__(self, d_model, heads, hidden_dim): super().__init__() self.attention = MultiHeadAttention(d_model=d_model, heads=heads) self.norm1 = Normalization(d_model=d_model) self.dropout1 = nn.Dropout(0.5) self.feed_forward = 
PositionwiseFeedForward(d_model=d_model, hidden_dim=hidden_dim) self.norm2 = Normalization(d_model=d_model) self.dropout2 = nn.Dropout(0.5) def forward(self, x, mask=None): x2 = self.attention(x, x, x, mask=mask) x = self.norm1(x + self.dropout1(x2)) x2 = self.feed_forward(x) x = self.norm2(x + self.dropout2(x2)) return x class Encoder(nn.Module): def __init__(self, d_model, heads, hidden_dim, num_layers): super().__init__() self.layers = nn.ModuleList([ EncoderLayer(d_model=d_model, heads=heads, hidden_dim=hidden_dim) for _ in range(num_layers) ]) def forward(self, src, mask=None): for layer in self.layers: src = layer(src, mask=mask) return src class DecoderLayer(nn.Module): def __init__(self, d_model, heads, hidden_dim): super().__init__() self.attention1 = MultiHeadAttention(d_model=d_model, heads=heads) self.norm1 = Normalization(d_model=d_model) self.dropout1 = nn.Dropout(0.5) self.attention2 = MultiHeadAttention(d_model=d_model, heads=heads) self.norm2 = Normalization(d_model=d_model) self.dropout2 = nn.Dropout(0.5) self.feed_forward = PositionwiseFeedForward(d_model=d_model, hidden_dim=hidden_dim) self.norm3 = Normalization(d_model=d_model) self.dropout3 = nn.Dropout(0.5) def forward(self, x, memory, src_mask=None, tgt_mask=None): x2 = self.attention1(x, x, x, mask=tgt_mask) x = self.norm1(x + self.dropout1(x2)) x2 = self.attention2(x, memory, memory, mask=src_mask) x = self.norm2(x + self.dropout2(x2)) x2 = self.feed_forward(x) x = self.norm3(x + self.dropout3(x2)) return x class Decoder(nn.Module): def __init__(self, d_model, heads, hidden_dim, num_layers): super().__init__() self.layers = nn.ModuleList([ DecoderLayer(d_model=d_model, heads=heads, hidden_dim=hidden_dim) for _ in range(num_layers) ]) def forward(self, tgt, memory, src_mask=None, tgt_mask=None): for layer in self.layers: tgt = layer(tgt, memory, src_mask=src_mask, tgt_mask=tgt_mask) return tgt class Transformer(nn.Module): def __init__(self, d_model, heads, hidden_dim, num_layers, 
src_vocab_size, tgt_vocab_size, max_length): super().__init__() self.encoder = Encoder(d_model=d_model, heads=heads, hidden_dim=hidden_dim, num_layers=num_layers) self.decoder = Decoder(d_model=d_model, heads=heads, hidden_dim=hidden_dim, num_layers=num_layers) self.src_embedding = nn.Embedding(src_vocab_size, d_model) self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model) self.out = nn.Linear(d_model, tgt_vocab_size) self.max_length = max_length def make_src_mask(self, src): src_mask = (src != 0) return src_mask def make_tgt_mask(self, tgt): tgt_pad_mask = (tgt != 0) tgt_len = tgt.shape[1] tgt_sub_mask = torch.tril(torch.ones((tgt_len, tgt_len))) tgt_mask = tgt_pad_mask.unsqueeze(1) & tgt_sub_mask return tgt_mask def forward(self, src, tgt): src_mask = self.make_src_mask(src) tgt_mask = self.make_tgt_mask(tgt) src_embedded = self.src_embedding(src) tgt_embedded = self.tgt_embedding(tgt) memory = self.encoder(src_embedded, mask=src_mask) output = self.decoder(tgt_embedded, memory, src_mask=src_mask, tgt_mask=tgt_mask) output = self.out(output) return output ``` 与自定义层类似,在PyTorch实现Transformer模型也借助于nn.Module和nn.Parameter类定义自己的模型层。上述代码中,分别定义了MultiHeadAttention、PositionwiseFeedForward、Normalization、EncoderLayer、Encoder、DecoderLayer、DecoderTransformer八个类,一共分为Encoder、DecoderTransformer三部分。 对于Transformer模型而言,Encoder有若干个EncoderLayer层,每个EncoderLayer层中有一个MultiHeadAttention层和一个PositionwiseFeedForward层,而Decoder中也有若干个DecoderLayer层,每个DecoderLayer层中有两个MultiHeadAttention层和一个PositionwiseFeedForward层。在Encoder和Decoder的代码中,还分别添加了make_src_mask和make_tgt_mask函数,用于生成掩码。 最后,我们使用Transformer类将Encoder和Decoder组合在一起,并实现整个模型的前向传递。在前向传递的过程中,我们需要先通过词向量嵌入层将输入编码,然后在Encoder中将编码的输入信息进行处理,并在Decoder中将编码信息解码,最终通过输出层得到输出。整个模型都是基于PyTorch的自带模块组合而成的。 综上所述,通过自定义层或者利用PyTorch自带模块,我们可以很容易地实现Transformer模型,并使用PyTorch框架进行训练和预测等操作。 ### 回答3: 
transformer是自然语言处理领域一种重要的模型,它在机器翻译、文本生成、文本分类等任务中都有广泛的应用。PyTorch是一种流行的深度学习框架,它能够帮助我们更加方便地实现各种深度学习算法,包括transformertransformer模型的核心是自注意力机制,它可以让模型在处理序列数据时能够自动地关注到重要的信息。具体来说,transformer的自注意力机制包含了三个部分:查询(Q)、键(K)和值(V)。每个部分都是向量,其中查询向量表示我们希望关注到的信息,而键向量和值向量则表示序列中的每个位置都包含的信息。通过计算查询向量和所有键向量之间的相似度,我们可以得到一个权重向量,用来表示每个位置对于查询向量的重要程度。然后,我们可以将重要程度和对应位置的值向量加权求和,得到自注意力机制的输出。 在PyTorch实现transformer模型,我们可以借助官方提供的transformer模块,只需要定义好模型的输入、输出、层数等超参数,就能够很方便地搭建一个transformer模型。下面是一个实现transformer模型的样例代码: import torch.nn as nn import torch.nn.functional as F from torch.nn import TransformerEncoder, TransformerEncoderLayer class TransformerModel(nn.Module): def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): super(TransformerModel, self).__init__() self.pos_encoder = PositionalEncoding(ninp, dropout) encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) self.encoder = nn.Embedding(ntoken, ninp) self.ninp = ninp self.decoder = nn.Linear(ninp, ntoken) self.init_weights() def init_weights(self): initrange = 0.1 self.encoder.weight.data.uniform_(-initrange, initrange) self.decoder.bias.data.zero_() self.decoder.weight.data.uniform_(-initrange, initrange) def forward(self, src, src_mask): src = self.encoder(src) * math.sqrt(self.ninp) src = self.pos_encoder(src) output = self.transformer_encoder(src, src_mask) output = self.decoder(output) return output 其中,我们使用了PositionalEncoding模块来对输入的序列进行位置编码,EncoderLayer模块实现transformer的一个编码层,Encoder模块则包含了多个编码层,组成了整个transformer模型。在forward函数中,我们首先对输入进行嵌入和位置编码操作,然后使用transformer编码器进行编码,最后通过线性层得到模型的输出。 总之,PyTorch提供了方便的transformer模块实现方式,我们只需要定义好模型的超参数和组件,就可以快速搭建出一个强大的transformer模型来处理不同的NLP任务。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值