序列模型pytorch的简单例子实现

clearsky767

已于 2023-04-04 15:35:32 修改

阅读量313

点赞数

分类专栏：深度学习　文章标签：算法 transformer 深度学习 pytorch

于 2023-04-03 11:33:34 首次发布

本文链接：https://blog.csdn.net/clearsky767/article/details/129925932

版权

深度学习专栏收录该内容

21 篇文章 1 订阅

订阅专栏

简介

Pytorch中 nn.Transformer的使用详解与Transformer的黑盒讲解_iioSnail的博客-CSDN博客

代码

例子1

#coding=utf-8

import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn


"""
https://blog.csdn.net/zhaohongfei_358/article/details/126019181

一些细节:
https://blog.csdn.net/zhaohongfei_358/article/details/122861751

https://zhuanlan.zhihu.com/p/360343417
https://zhuanlan.zhihu.com/p/389183195
https://zhuanlan.zhihu.com/p/398039366?utm_medium=social&utm_oi=629375409599549440

http://nlp.seas.harvard.edu/2018/04/03/attention.html
https://mp.weixin.qq.com/s/cY0IkHTpxS6x6cqsueXZIg
"""

"""
自注意力机制
"""
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention over a batch of sequences."""

    def __init__(self, input_vector_dim: int, dim_k=None, dim_v=None):
        """
        Create the three bias-free projection layers.

        input_vector_dim: size of every input vector (e.g. 10 if each word is
            encoded as a 10-dimensional vector).
        dim_k: output size of the query and key projections; falls back to
            input_vector_dim when None.
        dim_v: output size of the value projection, i.e. the size of every
            output vector; falls back to input_vector_dim when None.
        """
        super().__init__()

        self.input_vector_dim = input_vector_dim
        key_dim = input_vector_dim if dim_k is None else dim_k
        value_dim = input_vector_dim if dim_v is None else dim_v

        # Trainable matrices are expressed as linear layers so that autograd
        # and the optimizer handle them like any other parameter.
        self.W_q = nn.Linear(input_vector_dim, key_dim, bias=False)
        self.W_k = nn.Linear(input_vector_dim, key_dim, bias=False)
        self.W_v = nn.Linear(input_vector_dim, value_dim, bias=False)

        # Scaling factor 1 / sqrt(d_k) applied to the raw attention scores.
        self._norm_fact = 1 / np.sqrt(key_dim)

    def forward(self, x):
        """
        Apply self-attention.

        x: tensor of size (batch_size, input_num, input_vector_dim).
        Returns a tensor of size (batch_size, input_num, dim_v).
        """
        queries = self.W_q(x)
        keys = self.W_k(x)
        values = self.W_v(x)

        # Swap the last two axes of the keys so that bmm computes Q @ K^T
        # independently for every batch element.
        scores = torch.bmm(queries, keys.transpose(1, 2)) * self._norm_fact
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the value vectors.
        return torch.bmm(weights, values)


def test_SelfAttention():
    """
    Smoke-test SelfAttention and print the output size.

    A batch of 50 sequences, each holding 5 vectors of dimension 128, is
    encoded into 5 vectors of dimension 64 per sequence.
    """
    model = SelfAttention(128, 32, 64)

    # torch.rand gives well-defined values; the legacy torch.Tensor(50, 5, 128)
    # constructor returns uninitialised memory that may contain NaN/inf.
    x = torch.rand(50, 5, 128)
    y = model(x)
    print(y.size())

    print("test end.")


"""
多头自注意力机制
"""
def attention(query, key, value):
    """
    Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    The arguments are the already projected Q, K and V tensors (the
    projections themselves live in the calling module). Two layouts are
    supported because only the last two axes are used:

    * plain self-attention: (batch, tokens, d_model), e.g. (1, 7, 128);
    * multi-head attention: (batch, heads, tokens, d_model / heads),
      e.g. (1, 8, 7, 16) -- the model dimension is simply split across heads.

    The result has the leading axes of the inputs, the token axis of `query`
    and the feature axis of `value`. For the multi-head layout the caller is
    responsible for merging the heads afterwards.
    """
    # Size of the last axis: d_model for plain self-attention,
    # d_model / heads for the multi-head layout.
    feature_dim = query.shape[-1]

    # Q K^T / sqrt(d_k); the trailing two axes form a (tokens, tokens) matrix.
    raw_scores = torch.matmul(query, key.transpose(-2, -1))
    scaled_scores = raw_scores / math.sqrt(feature_dim)

    # Normalise every row into a probability distribution over the keys.
    weights = torch.softmax(scaled_scores, dim=-1)

    # Weighted sum of the value vectors.
    return torch.matmul(weights, value)


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built from four d_model x d_model projections."""

    def __init__(self, h, d_model):
        """
        h: number of attention heads.
        d_model: model (embedding) dimension; must be divisible by h.
        """
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        # W^q, W^k, W^v and W^o, in that order. They must live in an
        # nn.ModuleList: a plain Python list is invisible to nn.Module, so the
        # weights would be missing from parameters(), state_dict() and .to().
        self.linears = nn.ModuleList(
            [
                nn.Linear(d_model, d_model),
                nn.Linear(d_model, d_model),
                nn.Linear(d_model, d_model),
                nn.Linear(d_model, d_model),
            ]
        )

    def forward(self, x):
        """
        x: tensor of shape (batch, tokens, d_model).
        Returns a tensor of shape (batch, tokens, d_model).
        """
        nbatches = x.size(0)
        w_q, w_k, w_v, w_o = self.linears

        # 1. Project x to Q, K and V, then split every projection into heads:
        #    (batch, tokens, d_model)
        #      -> view      (batch, tokens, heads, d_model / heads)
        #      -> transpose (batch, heads, tokens, d_model / heads)
        query, key, value = [
            proj(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for proj in (w_q, w_k, w_v)
        ]

        # 2. Attention per head; result is (batch, heads, tokens, d_model / heads).
        heads = attention(query, key, value)

        # 3. Merge the heads again: move the head axis next to the feature
        #    axis, make the memory contiguous so view() is legal, and flatten
        #    (heads, d_model / heads) back into d_model.
        merged = (
            heads.transpose(1, 2)
            .contiguous()
            .view(nbatches, -1, self.h * self.d_k)
        )

        # Final linear transformation with W^o.
        return w_o(merged)


def test_MultiHeadAttention():
    """Smoke-test MultiHeadAttention and print the output size."""
    # 8 heads, model dimension 256.
    mha = MultiHeadAttention(8, 256)
    # Batch of 2 sentences, 7 words each, every word a 256-dimensional vector.
    words = torch.rand(2, 7, 256)
    attended = mha(words)
    print(attended.shape)

    print("test end.")


def test_TransformerEncoder():
    """Run a 2-layer nn.TransformerEncoder on random data and print the output size."""
    # Model dimension 256 split over 8 heads, feed-forward width 512.
    encoder_layer = nn.TransformerEncoderLayer(
        256, nhead=8, dim_feedforward=512, batch_first=True
    )
    encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
    # Batch of 2 sentences, 7 words each, every word a 256-dimensional vector.
    words = torch.rand(2, 7, 256)
    encoded = encoder(words)
    print(encoded.shape)

    print("test end.")


def test_TransformerDecoder():
    """Run a 2-layer nn.TransformerDecoder on random data and print the output size."""
    # Model dimension 256 split over 8 heads, feed-forward width 512.
    decoder_layer = nn.TransformerDecoderLayer(
        d_model=256, nhead=8, dim_feedforward=512, batch_first=True
    )
    decoder = nn.TransformerDecoder(decoder_layer, num_layers=2)
    # Encoder memory: batch of 2 sentences, 7 words, 256 dimensions per word.
    memory = torch.rand(2, 7, 256)
    # Target sequence: same batch, 6 words.
    target = torch.rand(2, 6, 256)
    decoded = decoder(target, memory)
    print(decoded.shape)

    print("test end.")



def main():
    """Run the selected demo of this example between two banner lines."""
    # Other demos available in this example: test_SelfAttention,
    # test_MultiHeadAttention and test_TransformerEncoder.
    print("main")
    test_TransformerDecoder()
    print("main end.")


if __name__ == "__main__":
    main()

例子2

#coding=utf-8

import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn


"""
https://blog.csdn.net/zhaohongfei_358/article/details/126019181
"""


def test_nnTransformer():
    """Feed one embedded sentence pair through a default nn.Transformer and print the output size."""
    # Vocabulary of 10 tokens, each embedded as a 128-dimensional vector.
    embed = nn.Embedding(10, 128)
    # Model dimension equals the embedding size; batch_first is essential here
    # because the token tensors below are laid out as (batch, tokens).
    model = nn.Transformer(d_model=128, batch_first=True)
    # Source sentence: begin marker (0), six words, end marker (1), two pads (2).
    source_ids = torch.LongTensor([[0, 3, 4, 5, 6, 7, 8, 1, 2, 2]])
    # Target sentence: begin marker, six words, end marker, one pad.
    target_ids = torch.LongTensor([[0, 3, 4, 5, 6, 7, 8, 1, 2]])
    # No positional encoding is added in this minimal demo.
    result = model(embed(source_ids), embed(target_ids))
    print(result.shape)
    print("*" * 50)
    print(result.shape)


def get_key_padding_mask(tokens):
    """
    Build an additive key-padding mask for a batch of token ids.

    tokens: integer tensor of token ids; id 2 is treated as padding.
    Returns a float tensor with the same shape as `tokens`, allocated on the
    same device, holding -inf at padding positions and 0 everywhere else.
    """
    # Allocate on the tokens' device so the mask can be passed to a model that
    # lives on that device without a device-mismatch error.
    key_padding_mask = torch.zeros(tokens.size(), device=tokens.device)
    key_padding_mask[tokens == 2] = -torch.inf
    return key_padding_mask

def test_nnTransformer2():
    """Run nn.Transformer with a causal target mask and padding masks, printing masks and output size."""
    # Two source sentences padded with id 2 to a common length of 11.
    source_ids = torch.LongTensor(
        [
            [0, 8, 3, 5, 5, 9, 6, 1, 2, 2, 2],
            [0, 6, 6, 8, 9, 1, 2, 2, 2, 2, 2],
        ]
    )
    # The matching target sentences, padded to length 10.
    target_ids = torch.LongTensor(
        [
            [0, 8, 3, 5, 5, 9, 6, 1, 2, 2],
            [0, 6, 6, 8, 9, 1, 2, 2, 2, 2],
        ]
    )

    # Causal mask: a target position may only attend to earlier positions.
    causal_mask = nn.Transformer.generate_square_subsequent_mask(target_ids.size(-1))
    print(causal_mask)

    # Padding masks keep attention away from the padding tokens.
    source_padding = get_key_padding_mask(source_ids)
    target_padding = get_key_padding_mask(target_ids)
    print(source_padding)
    print("=" * 50)
    print(target_padding)

    # Vocabulary of 10 tokens, each embedded as a 128-dimensional vector.
    embed = nn.Embedding(10, 128)
    # Model dimension equals the embedding size; batch_first matches the
    # (batch, tokens) layout of the id tensors.
    model = nn.Transformer(d_model=128, batch_first=True)
    # No positional encoding is added in this demo.
    result = model(
        embed(source_ids),
        embed(target_ids),
        tgt_mask=causal_mask,
        src_key_padding_mask=source_padding,
        tgt_key_padding_mask=target_padding,
    )

    print(result.shape)
    print("*" * 50)
    print(result.shape)


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embedded tokens, then apply dropout."""

    def __init__(self, d_model, dropout, max_len=5000):
        """
        d_model: size of every token embedding (even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: longest sequence length the precomputed table supports.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Table of shape (max_len, d_model) holding the encoding of every position.
        pe = torch.zeros(max_len, d_model)
        # Column vector of positions [[0], [1], [2], ...].
        position = torch.arange(0, max_len).unsqueeze(1)
        # 1 / 10000^(2i / d_model), evaluated through exp/log.
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        # PE(pos, 2i) = sin(...)
        pe[:, 0::2] = torch.sin(angles)
        # PE(pos, 2i+1) = cos(...). For an odd d_model there is one odd column
        # fewer than even columns, so the surplus cosine column is dropped
        # instead of raising a shape mismatch.
        pe[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
        # Leading batch axis so the table broadcasts over a batch.
        pe = pe.unsqueeze(0)
        # A buffer is saved with the model but is not a trainable parameter.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        x: embedded inputs of shape (batch, tokens, d_model), e.g. (1, 7, 128);
           the number of tokens must not exceed max_len.
        Returns a tensor of the same shape as x.
        """
        # Add the encodings of the first x.size(1) positions.
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

class MyTransformer(nn.Module):
    """Small sequence-to-sequence Transformer over a 10-token vocabulary."""

    def __init__(self, d_model=128):
        """
        d_model: embedding / model dimension shared by every sub-module. It
            must be divisible by the number of attention heads that
            nn.Transformer uses by default.
        """
        super(MyTransformer, self).__init__()

        # Token embeddings for a vocabulary of 10 ids.
        self.embedding = nn.Embedding(num_embeddings=10, embedding_dim=d_model)
        # Transformer core. Every dimension follows d_model; previously 128 was
        # hard-coded here, so any other d_model crashed in forward().
        self.transformer = nn.Transformer(d_model=d_model, num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=512, batch_first=True)
        # Positional encoding without dropout.
        self.positional_encoding = PositionalEncoding(d_model, dropout=0)
        # Final projection to vocabulary logits. No softmax here because
        # CrossEntropyLoss applies it internally.
        self.predictor = nn.Linear(d_model, 10)

    def forward(self, src, tgt):
        """
        src: integer tensor of source token ids, shape (batch, src_tokens).
        tgt: integer tensor of target token ids, shape (batch, tgt_tokens).
        Returns the raw transformer output of shape (batch, tgt_tokens, d_model).
        The predictor layer is deliberately applied by the caller because
        training and inference use the output differently.
        """
        # Causal mask for the target, moved to the inputs' device.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size()[-1]).to(tgt.device)
        src_key_padding_mask = MyTransformer.get_key_padding_mask(src)
        tgt_key_padding_mask = MyTransformer.get_key_padding_mask(tgt)

        # Embed the token ids and add position information.
        src = self.positional_encoding(self.embedding(src))
        tgt = self.positional_encoding(self.embedding(tgt))

        out = self.transformer(src, tgt,
                               tgt_mask=tgt_mask,
                               src_key_padding_mask=src_key_padding_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask)

        return out

    @staticmethod
    def get_key_padding_mask(tokens):
        """
        Build the additive key-padding mask for `tokens`: -inf where the id is
        2 (padding), 0 elsewhere, allocated on the same device as `tokens`.
        """
        key_padding_mask = torch.zeros(tokens.size(), device=tokens.device)
        key_padding_mask[tokens == 2] = -torch.inf
        return key_padding_mask

#定义一个生成随机数据的函数,模拟句子来训练
def generate_random_batch(batch_size, max_length=16):
    """
    Generate a random batch of padded "sentences" for the copy task.

    Every sentence is: id 0 (begin), between 1 and max_length - 2 random word
    ids in 3..9, id 1 (end), then id 2 (padding) up to max_length.

    Returns (src, tgt, tgt_y, n_tokens):
        src: LongTensor of shape (batch_size, max_length).
        tgt: src without its last column (decoder input).
        tgt_y: src without its first column (prediction targets).
        n_tokens: 0-dim tensor counting the non-padding entries of tgt_y.
    """

    def build_sentence():
        # Draw the length first, then the words, so the random stream is
        # consumed in a fixed order.
        word_count = random.randint(1, max_length - 2)
        words = [random.randint(3, 9) for _ in range(word_count)]
        padding = [2] * (max_length - word_count - 2)
        return [0] + words + [1] + padding

    sentences = [build_sentence() for _ in range(batch_size)]
    src = torch.LongTensor(sentences)

    # Decoder input drops the last token; targets drop the first one.
    tgt, tgt_y = src[:, :-1], src[:, 1:]

    # Number of real (non-padding) tokens to predict, used for loss scaling.
    n_tokens = (tgt_y != 2).sum()
    return src, tgt, tgt_y, n_tokens


def test_train():
    """
    Train MyTransformer on the random copy task, then greedily decode one sentence.

    Prints the accumulated loss every 40 steps, the decoded token ids and a
    final marker line.
    """
    max_length = 16
    model = MyTransformer()
    # ignore_index=2 drops the padding targets, so the default mean reduction
    # averages over the real tokens only. The previous code averaged over all
    # positions, padding included, and then divided by n_tokens a second time.
    criterion = nn.CrossEntropyLoss(ignore_index=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

    # Training.
    model.train()
    total_loss = 0.0
    for step in range(2000):
        # Fresh random batch for every step.
        src, tgt, tgt_y, _ = generate_random_batch(batch_size=2, max_length=max_length)
        optimizer.zero_grad()
        # Transformer output followed by the vocabulary projection.
        out = model.predictor(model(src, tgt))
        # Flatten (batch, tokens, vocab) to (batch * tokens, vocab) and the
        # targets to (batch * tokens,). reshape copes with the non-contiguous
        # target slice.
        loss = criterion(out.reshape(-1, out.size(-1)), tgt_y.reshape(-1))
        loss.backward()
        optimizer.step()

        # .item() yields a plain float; summing the tensors themselves would
        # keep autograd history alive across iterations.
        total_loss += loss.item()

        # Report every 40 steps.
        if step != 0 and step % 40 == 0:
            print("Step {}, total_loss: {}".format(step, total_loss))
            total_loss = 0.0

    # Prediction.
    model = model.eval()
    # An arbitrary source sentence.
    src = torch.LongTensor([[0, 4, 3, 4, 6, 8, 9, 9, 8, 1, 2, 2]])
    # Decoding starts from the begin marker; the goal is to reproduce src.
    tgt = torch.LongTensor([[0]])

    # Greedy decoding, one token at a time, until the end marker (id 1) or
    # max_length tokens. No gradients are needed for inference.
    with torch.no_grad():
        for _ in range(max_length):
            out = model(src, tgt)
            # Only the prediction for the last position matters.
            predict = model.predictor(out[:, -1])
            y = torch.argmax(predict, dim=1)
            # Append the predicted token to the decoded sequence.
            tgt = torch.cat([tgt, y.unsqueeze(0)], dim=1)

            if y.item() == 1:
                break

    print(tgt)

    print("test train end.")



def main():
    """Run the training demo between two banner lines."""
    print("main")
    test_train()
    print("end.")


if __name__ == "__main__":
    main()

例子3

#coding=utf-8

import math
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn


"""
下面网址里面的官网链接有相关代码
https://www.cnblogs.com/lfri/p/15044391.html

https://blog.csdn.net/adczsw/article/details/121719210
https://blog.csdn.net/SHU15121856/article/details/104448734
https://blog.csdn.net/weixin_44751294/article/details/124704785
注意是不是双向网络 则D为2
"""

#RNN单层例子
def test_RNNCell():
    """Unroll one nn.RNNCell by hand over a 5-step sequence and print the shapes."""
    cell = nn.RNNCell(10, 20)
    sequence = torch.randn(5, 3, 10)
    hidden = torch.randn(3, 20)

    # Feed one time step after another, carrying the hidden state forward.
    per_step = []
    for step_input in sequence:
        hidden = cell(step_input, hidden)
        per_step.append(hidden)
    stacked = torch.stack(per_step, dim=0)

    print(stacked.shape)
    print(hidden.shape)
    print("test end.")

def test_RNN():
    """Run a 6-layer nn.RNN over a random sequence and print the shapes."""
    layer_count = 6
    network = nn.RNN(10, 20, layer_count)
    sequence = torch.randn(5, 3, 10)
    initial_hidden = torch.randn(layer_count, 3, 20)
    outputs, final_hidden = network(sequence, initial_hidden)

    print(outputs.shape)
    print(final_hidden.shape)
    print("test end.")


def test_LSTMCell():
    """Unroll one nn.LSTMCell by hand over a 5-step sequence and print the shapes."""
    cell = nn.LSTMCell(10, 20)  # (input_size, hidden_size)
    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    hidden = torch.randn(3, 20)  # (batch, hidden_size)
    cell_state = torch.randn(3, 20)

    # Feed one time step after another, carrying both states forward.
    per_step = []
    for step_input in sequence:
        hidden, cell_state = cell(step_input, (hidden, cell_state))
        per_step.append(hidden)
    stacked = torch.stack(per_step, dim=0)

    print(stacked.shape)
    print(hidden.shape)
    print(cell_state.shape)
    print("test end.")


def test_LSTM():
    """Run a 6-layer nn.LSTM over a random sequence and print the shapes."""
    layer_count = 6
    network = nn.LSTM(10, 20, layer_count)
    sequence = torch.randn(5, 3, 10)  # (time_steps, batch, input_size)
    # Initial states: (D * num_layers, batch, hidden_size), where D is 2 for a
    # bidirectional network and 1 otherwise (1 here).
    initial_hidden = torch.randn(layer_count, 3, 20)
    initial_cell = torch.randn(layer_count, 3, 20)
    # outputs: (time_steps, batch, D * hidden_size)
    # final states: (D * num_layers, batch, hidden_size)
    outputs, (final_hidden, final_cell) = network(sequence, (initial_hidden, initial_cell))

    print(outputs.shape)
    print(final_hidden.shape)
    print(final_cell.shape)
    print("test end.")


def test_GRUCell():
    """Unroll one nn.GRUCell by hand over a 5-step sequence and print the shapes."""
    cell = nn.GRUCell(10, 20)
    sequence = torch.randn(5, 3, 10)
    hidden = torch.randn(3, 20)

    # Feed one time step after another, carrying the hidden state forward.
    per_step = []
    for step_input in sequence:
        hidden = cell(step_input, hidden)
        per_step.append(hidden)
    stacked = torch.stack(per_step, dim=0)

    print(stacked.shape)
    print(hidden.shape)
    print("test end.")

def test_GRU():
    """Run a 6-layer nn.GRU over a random sequence and print the shapes."""
    layer_count = 6
    network = nn.GRU(10, 20, layer_count)
    sequence = torch.randn(5, 3, 10)
    initial_hidden = torch.randn(layer_count, 3, 20)
    outputs, final_hidden = network(sequence, initial_hidden)

    print(outputs.shape)
    print(final_hidden.shape)
    print("test end.")


def main():
    """Run the selected recurrent-network demo between two banner lines."""
    # Other demos available in this example: test_RNN, test_RNNCell,
    # test_LSTM, test_LSTMCell and test_GRU.
    print("main")
    test_GRUCell()
    print("end.")


if __name__ == "__main__":
    main()