Transformer

EnjoyFailure

已于 2023-10-31 19:51:28 修改

阅读量49

点赞数

分类专栏： NLP 文章标签： transformer 深度学习人工智能

于 2023-10-29 22:56:56 首次发布

本文链接：https://blog.csdn.net/S_5922/article/details/134101699

版权

NLP 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

模型结构图

Q（query）、K（key）、V（value）矩阵的含义：

Q、K、V 是由输入的词向量 x 经过线性变换得到的，其中各个权重矩阵 W 可以通过学习得到。这种变换可以提升模型的拟合能力。得到的 Q、K、V 可以理解为：
Q: 要查询的信息
K: 被查询的向量
V: 查询得到的值

总结:
首先，Q、K、V 都源于输入特征本身，是根据输入特征产生的向量，但目前我们无需关注这组向量是如何产生的。
V可以看做表示单个输入特征的向量。当我们直接把一组V输入到网络中进行训练，那这个网络就是没有引入Attention机制的网络。
但如果引入 Attention，就需要将这组 V 分别乘以一组权重 $\alpha$，这样就可以有重点地关注输入特征，如同人的注意力一般。

层归一化（Layer Normalization）：是一种特征缩放技术，用于稳定深度神经网络的训练。它在每个样本的所有特征上进行归一化，使得输出的均值为0，标准差为1。这可以帮助网络更好地学习不同特征的相对重要性，防止梯度爆炸或消失，从而帮助模型更好地收敛。

位置编码（PositionEmbedding）：因为 encoder 的输入是并行的，词与词之间并没有位置信息；加入位置编码，可以给输入增加位置信息。

代码实现

data.py

import numpy as np
import random
import torch
from torch.utils import data

# Source-side vocabulary: the three special tokens, the digits and the
# lowercase letters, mapped to consecutive integer ids in the order listed.
dict_x = '<SOS>,<EOS>,<PAD>,0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z'
dict_x = {word: i for i, word in enumerate(dict_x.split(','))}
# Target-side vocabulary: same ids as dict_x, with every token upper-cased.
dict_y = {word.upper(): i for word, i in dict_x.items()}
# Id -> token lookup for the target vocabulary (list position == token id).
# The earlier assignment of dict_word from dict_x was overwritten right away
# by this one, so only this binding is kept.
dict_word = list(dict_y)

def get_data():
    """Generate one random (x, y) training pair encoded as id tensors.

    x is a random sequence of 30 to 48 symbols drawn with replacement from
    the digits and lowercase letters.  y is derived from x: letters are
    upper-cased, each digit d becomes 9 - d, the last symbol is doubled and
    the whole sequence is reversed.  Both sequences are wrapped in
    <SOS>/<EOS>, padded with <PAD> and cut to a fixed length.

    Returns:
        Tuple ``(x, y)`` of 1-D tensors holding 50 and 51 token ids.
    """
    vocab = list('0123456789qwertyuiopasdfghjklzxcvbnm')
    # Unnormalised sampling weights: 1..10 for the digits, 1..26 for the letters.
    weights = np.array(list(range(1, 11)) + list(range(1, 27)))
    weights = weights / weights.sum()

    # Draw the sequence length first, then the symbols themselves.
    length = random.randint(30, 48)
    # replace=True: the same symbol may be drawn more than once.
    source = np.random.choice(vocab, size=length, replace=True, p=weights).tolist()

    def translate(symbol):
        # Digits are mirrored around 9, everything else is upper-cased.
        if symbol.isdigit():
            return str(9 - int(symbol))
        return symbol.upper()

    translated = [translate(symbol) for symbol in source]
    # Double the last symbol and reverse the sequence.
    target = [translated[-1]] + translated[::-1]

    # Add start/end markers, then pad and truncate to the fixed lengths.
    x = (['<SOS>'] + source + ['<EOS>'] + ['<PAD>'] * 50)[:50]
    y = (['<SOS>'] + target + ['<EOS>'] + ['<PAD>'] * 51)[:51]

    # Encode tokens as vocabulary ids.
    x = torch.tensor([dict_x[token] for token in x])
    y = torch.tensor([dict_y[token] for token in y])
    return x, y

class Dataset(data.Dataset):
    """Synthetic dataset: every item is a fresh random sample from get_data()."""

    def __len__(self):
        # Nominal size only; samples are generated on demand, not stored.
        return 100000

    def __getitem__(self, index):
        # The index is ignored: each access produces a new random pair.
        return get_data()
# Batched loader over the synthetic dataset: 8 samples per batch, shuffled.
loader = data.DataLoader(
    dataset=Dataset(),
    batch_size=8,
    shuffle=True,
)

util.py

import torch
import math

from torch.serialization import SourceChangeWarning

def attention(Q, K, V, mask):
    """Scaled dot-product attention over several heads; heads are merged on return.

    The scaling factor and the output shape are derived from the inputs, so
    the function is not tied to 4 heads of 8 features over 50 tokens; for
    inputs of shape [b, 4, 50, 8] the result is the same as before.

    Args:
        Q: queries, shape [b, heads, len_q, d].
        K: keys, shape [b, heads, len_k, d].
        V: values, shape [b, heads, len_k, d_v].
        mask: boolean tensor broadcastable to [b, heads, len_q, len_k];
            True marks key positions that must receive zero attention.

    Returns:
        Tensor of shape [b, len_q, heads * d_v].
    """
    batch, heads, len_q, head_dim = Q.shape
    # [b, h, len_q, d] x [b, h, d, len_k] -> [b, h, len_q, len_k]
    score = torch.matmul(Q, K.transpose(-1, -2))
    # Scale by the square root of the per-head feature size.
    score = score / head_dim ** 0.5
    # Masked positions become -inf so that softmax maps them to 0.
    score = score.masked_fill(mask, -float('inf'))
    score = torch.softmax(score, dim=-1)
    # Weight the values: [b, h, len_q, len_k] x [b, h, len_k, d_v] -> [b, h, len_q, d_v]
    score = torch.matmul(score, V)
    # Merge the heads: [b, h, len_q, d_v] -> [b, len_q, h * d_v]
    return score.permute(0, 2, 1, 3).reshape(batch, len_q, heads * V.shape[-1])

class MultiHeadAttention(torch.nn.Module):
    """Multi-head attention block with layer norm applied first and a residual.

    Works on [b, 50, 32] inputs split into 4 heads of 8 features each.
    The inputs are normalised before the linear projections; per the
    original author's note this differs from the paper's order and was
    chosen because it worked better in their experiments.
    """

    def __init__(self):
        super().__init__()
        # Projections for queries, keys and values, plus the output projection
        # that maps the merged heads back to the embedding width.
        self.fc_Q = torch.nn.Linear(32, 32)
        self.fc_K = torch.nn.Linear(32, 32)
        self.fc_V = torch.nn.Linear(32, 32)
        self.fc_out = torch.nn.Linear(32, 32)
        # Normalise over the 32 features, with learnable scale and shift.
        self.norm = torch.nn.LayerNorm(normalized_shape=32, elementwise_affine=True)
        self.dropout = torch.nn.Dropout(p=0.1)

    def forward(self, Q, K, V, mask):
        """Attend from Q to K/V and add the result to the un-normalised Q.

        Args:
            Q, K, V: tensors of shape [b, 50, 32].
            mask: boolean mask handed unchanged to ``attention``.

        Returns:
            Tensor of shape [b, 50, 32].
        """
        batch = Q.shape[0]
        # Copy of the raw query for the residual connection.
        residual = Q.clone()

        def to_heads(linear, tensor):
            # Normalise, project, then [b, 50, 32] -> [b, 4, 50, 8].
            projected = linear(self.norm(tensor))
            return projected.reshape(batch, 50, 4, 8).permute(0, 2, 1, 3)

        heads_q = to_heads(self.fc_Q, Q)
        heads_k = to_heads(self.fc_K, K)
        heads_v = to_heads(self.fc_V, V)

        # [b, 4, 50, 8] -> [b, 50, 32]
        merged = attention(heads_q, heads_k, heads_v, mask)
        # Output projection with dropout, then the residual connection.
        return self.dropout(self.fc_out(merged)) + residual

class PositionEmbedding(torch.nn.Module):
    """Token embedding plus a fixed sinusoidal position table.

    Takes un-embedded token ids of shape [b, 50] and returns [b, 50, 32].
    """

    def __init__(self):
        super().__init__()

        def get_pe(pos, i, d_model):
            # pos: token position, i: feature index, d_model: feature count.
            # NOTE(review): the exponent uses the raw feature index for odd
            # features too, which is not the paper's pairing of sin/cos on a
            # shared frequency -- kept exactly as written.
            angle = pos / 1e4 ** (i / d_model)
            return math.sin(angle) if i % 2 == 0 else math.cos(angle)

        # Position table for 50 positions x 32 features.
        table = torch.tensor(
            [[get_pe(pos, dim, 32) for dim in range(32)] for pos in range(50)]
        )
        # Registered as a buffer ([1, 50, 32]) so it is not a trainable parameter.
        self.register_buffer('pe', table.unsqueeze(0))
        # 39 vocabulary entries, 32 features per token.
        self.embed = torch.nn.Embedding(39, 32)
        # NOTE(review): weights are initialised with mean 1 and std 0.1;
        # confirm that a mean of 1 (rather than 0) is intended.
        self.embed.weight.data.normal_(1, 0.1)

    def forward(self, x):
        """Embed token ids and add the position table.

        Args:
            x: token ids, shape [b, 50].

        Returns:
            Tensor of shape [b, 50, 32].
        """
        # [b, 50, 32] + [1, 50, 32] -> [b, 50, 32] via broadcasting.
        return self.embed(x) + self.pe

class FullyConnectedOutput(torch.nn.Module):
    """Position-wise feed-forward block with layer norm first and a residual."""

    def __init__(self):
        super().__init__()
        # 32 -> 64 -> 32 with ReLU in between and dropout on the output.
        stages = [
            torch.nn.Linear(32, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.Dropout(p=0.1),
        ]
        self.fc = torch.nn.Sequential(*stages)
        self.norm = torch.nn.LayerNorm(normalized_shape=32, elementwise_affine=True)

    def forward(self, x):
        """Apply norm and the feed-forward stack, then add the raw input back.

        Args:
            x: tensor whose last dimension is 32 (e.g. [b, 50, 32]).

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Copy of the raw input for the residual connection.
        residual = x.clone()
        return self.fc(self.norm(x)) + residual

mask.py

from data import dict_x, dict_y
import torch

def mask_pad(data):
    """Build the padding mask for a batch of un-embedded source token ids.

    Attention is computed between 50 tokens and 50 tokens, hence a 50x50
    mask.  A column is True when the corresponding token is <PAD>, so no
    token attends to padding.  Rows belonging to <PAD> tokens are not
    masked: padding positions still attend to the other tokens.

    Args:
        data: token ids, shape [b, 50].

    Returns:
        Boolean tensor of shape [b, 1, 50, 50].
    """
    is_pad = data == dict_x['<PAD>']
    # [b, 50] -> [b, 1, 1, 50], then repeat the row 50 times -> [b, 1, 50, 50].
    return is_pad.reshape(-1, 1, 1, 50).expand(-1, 1, 50, 50)

def mask_tril(data):
    """Build the combined look-ahead and padding mask for target token ids.

    The look-ahead part is the strict upper triangle of a 50x50 matrix
    (diagonal excluded): each token may see itself and earlier tokens, but
    not later ones.  For 5 tokens it would look like

        [[0, 1, 1, 1, 1],
         [0, 0, 1, 1, 1],
         [0, 0, 0, 1, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0]]

    Columns of <PAD> tokens are masked as well.

    Args:
        data: un-embedded token ids, shape [b, 50].

    Returns:
        Boolean tensor of shape [b, 1, 50, 50]; True means "masked".
    """
    # Zero entries of the lower-triangular matrix are the future positions: [1, 50, 50].
    future = torch.tril(torch.ones(1, 50, 50, dtype=torch.long)) == 0
    # [b, 50] -> [b, 1, 50]
    is_pad = (data == dict_y['<PAD>']).unsqueeze(1)
    # Union via broadcasting: [b, 1, 50] | [1, 50, 50] -> [b, 50, 50],
    # then add the head dimension -> [b, 1, 50, 50].
    return (is_pad | future).unsqueeze(1)

model.py

import torch
import util
import mask

class EncoderLayer(torch.nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block."""

    def __init__(self):
        super().__init__()
        self.mh = util.MultiHeadAttention()
        self.fc = util.FullyConnectedOutput()

    def forward(self, x, mask):
        """Run self-attention over ``x`` and then the feed-forward block.

        Per the original comments both steps keep the shape [b, 50, 32].
        """
        attended = self.mh(x, x, x, mask)
        return self.fc(attended)

class Encoder(torch.nn.Module):
    """Stack of three encoder layers applied in sequence with the same mask."""

    def __init__(self):
        super().__init__()
        self.layer_1 = EncoderLayer()
        self.layer_2 = EncoderLayer()
        self.layer_3 = EncoderLayer()

    def forward(self, x, mask):
        """Pass ``x`` through layer_1, layer_2 and layer_3 in that order."""
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            x = layer(x, mask)
        return x

class DecoderLayer(torch.nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward."""

    def __init__(self):
        super().__init__()
        self.mh_1 = util.MultiHeadAttention()
        self.mh_2 = util.MultiHeadAttention()
        self.fc = util.FullyConnectedOutput()

    def forward(self, x, y, mask_pad_x, mask_tril_y):
        """Decode ``y`` against the encoder output ``x``.

        Args:
            x: encoder output; used as keys and values of the second attention.
            y: decoder input.
            mask_pad_x: mask for the attention over ``x``.
            mask_tril_y: mask for the self-attention over ``y``.

        Returns:
            The layer output; per the original comments all tensors are [b, 50, 32].
        """
        # Self-attention over the target sequence.
        self_attended = self.mh_1(y, y, y, mask_tril_y)
        # Queries come from the target, keys and values from the encoder output.
        cross_attended = self.mh_2(self_attended, x, x, mask_pad_x)
        return self.fc(cross_attended)

class Decoder(torch.nn.Module):
    """Stack of three decoder layers; each one sees the same encoder output."""

    def __init__(self):
        super().__init__()
        self.layer_1 = DecoderLayer()
        self.layer_2 = DecoderLayer()
        self.layer_3 = DecoderLayer()

    def forward(self, x, y, mask_pad_x, mask_tril_y):
        """Pass ``y`` through layer_1, layer_2 and layer_3 in that order."""
        for layer in (self.layer_1, self.layer_2, self.layer_3):
            y = layer(x, y, mask_pad_x, mask_tril_y)
        return y

class Transformer(torch.nn.Module):
    """Full model: embeddings, encoder, decoder and a final 39-way projection."""

    def __init__(self):
        super().__init__()
        self.x_embed = util.PositionEmbedding()
        self.y_embed = util.PositionEmbedding()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.fc_out = torch.nn.Linear(32, 39)

    def forward(self, x, y):
        """Compute output scores for every target position.

        Args:
            x: source token ids; [b, 50] per the original comments.
            y: target token ids; [b, 50] per the original comments.

        Returns:
            Scores over the 39 vocabulary entries, [b, 50, 39].
        """
        # The masks are built from the raw token ids, before embedding.
        pad_mask = mask.mask_pad(x)
        causal_mask = mask.mask_tril(y)

        # Embed tokens and add position information: [b, 50] -> [b, 50, 32].
        x_emb = self.x_embed(x)
        y_emb = self.y_embed(y)

        memory = self.encoder(x_emb, pad_mask)
        decoded = self.decoder(memory, y_emb, pad_mask, causal_mask)

        # [b, 50, 32] -> [b, 50, 39]
        return self.fc_out(decoded)

main.py

import torch
from torch.nn.modules.loss import CrossEntropyLoss
import model
import mask
from data import dict_x, dict_y, loader
import util

def predict(x):
    """Greedily decode the output sequence for one source sentence.

    The encoder runs once; the decoder is then run 49 times and each run
    fills in the next target position with the highest-scoring token.
    Decoding runs under ``torch.no_grad()`` so no autograd graph is built.

    Args:
        x: source token ids, shape [1, 50].

    Returns:
        Tensor of shape [1, 50] with the predicted token ids; position 0 is
        always <SOS>.

    Side effects:
        Switches the module-level ``transformer`` to eval mode and leaves it
        there.
    """
    transformer.eval()
    with torch.no_grad():
        # [1, 1, 50, 50]
        mask_pad_x = mask.mask_pad(x)
        # Fixed initial target: <SOS> followed by 49 <PAD>, shape [1, 50].
        target = [dict_y['<SOS>']] + [dict_y['<PAD>']] * 49
        target = torch.tensor(target).unsqueeze(0)
        # Embed and encode the source once: [1, 50] -> [1, 50, 32].
        x = transformer.x_embed(x)
        x = transformer.encoder(x, mask_pad_x)
        # Generate target positions 1..49 one at a time.
        for i in range(49):
            # [1, 1, 50, 50]
            mask_tril_y = mask.mask_tril(target)
            # [1, 50] -> [1, 50, 32]
            y = transformer.y_embed(target)
            y = transformer.decoder(x, y, mask_pad_x, mask_tril_y)
            # [1, 50, 32] -> [1, 50, 39]
            out = transformer.fc_out(y)
            # The scores at position i predict the token at position i + 1.
            # No detach() is needed: nothing is tracked under no_grad().
            target[:, i + 1] = out[:, i, :].argmax(dim=1)

    return target

transformer = model.Transformer()
loss = torch.nn.CrossEntropyLoss()
trainer = torch.optim.Adam(transformer.parameters(), lr=2e-3)
epochs = 1  # number of passes over the loader

# Training loop.
for epoch in range(epochs):
    for i, (x, y) in enumerate(loader):
        # x: [8, 50], y: [8, 51]
        # Teacher forcing: the decoder is fed y without its last token and
        # must predict y shifted left by one position.
        decoder_input = y[:, :-1]
        labels = y[:, 1:].reshape(-1)  # [400]

        # Drop the padding positions from both the scores and the labels.
        keep = labels != dict_y['<PAD>']
        logits = transformer(x, decoder_input).reshape(-1, 39)[keep]
        labels = labels[keep]

        # Compute the gradients and update the parameters.
        l = loss(logits, labels)
        trainer.zero_grad()
        l.backward()
        trainer.step()

        # Report progress every 200 batches.
        if i % 200 != 0:
            continue
        hits = (logits.argmax(dim=1) == labels).sum().item()  # correctly predicted tokens
        accuracy = hits / len(labels)
        print('epoch:{}, i:{}, loss:{}, accuracy:{}'.format(epoch, i, l, accuracy))

运行结果

EnjoyFailure

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Transformer

它在每个样本的所有特征上进行归一化，使得输出的均值为0，标准差为1。Q、K、V 是由输入的词向量 x 经过线性变换得到的，其中各个权重矩阵 W 可以通过学习得到，这种变换可以提升模型的拟合能力。但如果引入 Attention，就需要将这组 V 分别乘以一组权重 $\alpha$，这样就可以有重点地关注输入特征，如同人的注意力一般。当我们直接把一组 V 输入到网络中进行训练，那这个网络就是没有引入 Attention 机制的网络。目前我们无需关注这组向量是如何产生的。
复制链接

扫一扫

专栏目录