基于Transformer的数字字母翻译
内容为转载(原文链接缺失)
文章目录
1. 任务描述
给定若干句话,每句话由小写字母和数字组成,长度在30~48个字符之间,各字符按预先设定的概率分布随机采样。翻译规则:小写字母翻译成大写字母,数字 i 翻译成 9-int(i),最后将整句逆序输出,例如 aa123 逐字符翻译为 AA876,逆序后得到 678AA。需要注意的是,为了让翻译后的句子和原句子长度不相同,逆序之前会把原句子最后一个字符的翻译结果再复制一次(逆序后即出现在译文开头):若原句子是 aa123,最终译文是 6678AA;若原句子是 123aa,最终译文是 AAA678。最后为了让每个句子长度一致,在每句话之前加上<SOS>,之后加上<EOS>,并用<PAD>补齐到固定长度(x 为50,y 为51),如下:
2. 代码文件结构
- mydata.py 定义实验数据
- mymain.py 主程序,训练+测试
- mymask.py 定义mask函数
- mymodel.py 定义模型结构
- myutil.py 定义工具函数
3. mydata.py
import random
import numpy as np
import torch
from torch.utils.data import DataLoader,Dataset
# Vocabulary tables shared by the data generator, the masks and the demo.
#
# Source side: three special markers, the ten digits, then the 26 lowercase
# letters in keyboard order. A token's id is its position in this list, e.g.
# '<SOS>' -> 0, '<EOS>' -> 1, '<PAD>' -> 2, '0' -> 3, ..., '9' -> 12,
# 'q' -> 13, ..., 'm' -> 38.
zidian_x = '<SOS>,<EOS>,<PAD>,0,1,2,3,4,5,6,7,8,9,q,w,e,r,t,y,u,i,o,p,a,s,d,f,g,h,j,k,l,z,x,c,v,b,n,m'
zidian_x = zidian_x.split(',')
# token -> id
zidian_x = dict(zip(zidian_x, range(len(zidian_x))))
# id -> token (dicts keep insertion order, so the keys are already in id order)
zidian_xr = list(zidian_x)

# Target side: same ids, but letters are upper-cased ('Q' -> 13, ..., 'M' -> 38).
# The special markers and the digits are unchanged by upper().
zidian_y = {token.upper(): index for token, index in zidian_x.items()}
# id -> token for the target vocabulary
zidian_yr = list(zidian_y)
def get_data():
    """Generate one random (source, target) sample as id tensors.

    The source is 30-48 tokens drawn from digits and lowercase letters with a
    non-uniform distribution. The target is built by upper-casing letters,
    replacing each digit d with 9 - d, repeating the last translated token
    once, and reversing the result.

    Returns:
        (x, y): LongTensors of length 50 and 51 holding the token ids of the
        source and target, each wrapped in <SOS>/<EOS> and padded with <PAD>.
    """
    # Candidate tokens: the ten digits followed by the letters in keyboard order.
    words = list('0123456789qwertyuiopasdfghjklzxcvbnm')

    # Unnormalised sampling weights: 1..10 for the digits, 1..26 for the letters.
    weights = np.array(list(range(1, 11)) + list(range(1, 27)))
    p = weights / weights.sum()

    # Draw a random-length source sentence (with replacement).
    n = random.randint(30, 48)
    x = np.random.choice(words, size=n, replace=True, p=p).tolist()

    def translate(token):
        # Digits map to their 9-complement, letters to upper case.
        if token.isdigit():
            return str(9 - int(token))
        return token.upper()

    y = [translate(token) for token in x]
    # Duplicate the last translated token so the target is one longer than
    # the source, then reverse the whole sequence.
    y.append(y[-1])
    y.reverse()

    # Add start/end markers, then pad and cut to the fixed lengths (50 / 51).
    x = (['<SOS>'] + x + ['<EOS>'] + ['<PAD>'] * 50)[:50]
    y = (['<SOS>'] + y + ['<EOS>'] + ['<PAD>'] * 51)[:51]

    # Encode tokens as ids and convert to tensors.
    x = torch.LongTensor([zidian_x[token] for token in x])
    y = torch.LongTensor([zidian_y[token] for token in y])
    return x, y
# print(get_data())
# Dataset definition
class dataset(Dataset):
    """Synthetic dataset: every access generates a brand-new random sample."""

    def __init__(self):
        super().__init__()

    def __len__(self):
        # Nominal number of samples per epoch; the data itself is generated
        # on demand, so this only controls how many batches an epoch has.
        return 100_000

    def __getitem__(self, item):
        # The index is ignored: each call returns a freshly sampled pair.
        return get_data()
# Data loader: batches of 8 freshly generated samples.
loader = DataLoader(
    dataset(),
    batch_size=8,
    shuffle=True,
    # Drop the final batch if it holds fewer than batch_size samples.
    drop_last=True,
    # None selects the default collation (stack the per-sample tensors);
    # pass a custom function here to control how samples are batched.
    collate_fn=None,
)
4. myutil.py
【注】:在包含LN和Res-Add的层结构中,原视频作者说先LN、再计算Attention(Linear-ReLU)、再Res-Add的效果优于原框架中先Attention(Linear-ReLU)、再Res-Add,再LN的效果。
import math
import torch
from torch.nn import Module, Linear
import torch.nn as nn
# Scaled dot-product attention
def attention(Q, K, V, mask):
    """Compute masked scaled dot-product attention and merge the heads.

    All sizes are read from the inputs, so the function is not tied to the
    50-token / 4-head / 8-dim configuration used elsewhere in this file.

    Args:
        Q: Queries, shape [b, heads, len_q, head_dim].
        K: Keys, shape [b, heads, len_k, head_dim].
        V: Values, shape [b, heads, len_k, head_dim].
        mask: Boolean tensor broadcastable to [b, heads, len_q, len_k]
            (e.g. [b, 1, len_q, len_k]). True marks positions that must not
            be attended to.

    Returns:
        Tensor of shape [b, len_q, heads * head_dim]. The inputs are not
        modified.
    """
    b, n_heads, len_q, head_dim = Q.shape

    # Attention logits of every query against every key:
    # [b, h, len_q, d] @ [b, h, d, len_k] -> [b, h, len_q, len_k]
    score = torch.matmul(Q, K.transpose(-1, -2))

    # Scale by the square root of the per-head dimension.
    score = score / head_dim ** 0.5

    # Masked positions get -inf so that softmax assigns them zero weight.
    score = score.masked_fill(mask, -float('inf'))
    score = torch.softmax(score, dim=-1)

    # Weighted sum of the values:
    # [b, h, len_q, len_k] @ [b, h, len_k, d] -> [b, h, len_q, d]
    score = torch.matmul(score, V)

    # Concatenate the heads: [b, h, len_q, d] -> [b, len_q, h * d]
    return score.permute(0, 2, 1, 3).reshape(b, len_q, n_heads * V.shape[-1])


# Multi-head attention layer
class MultiHead(Module):
    """Pre-LayerNorm multi-head attention with a residual connection.

    The inputs are normalised first, then projected, attended and projected
    again; the result is added to the un-normalised query.

    Args:
        d_model: Feature size of the inputs and the output (default 32).
        n_heads: Number of attention heads; must divide d_model (default 4).
        dropout: Dropout probability applied to the output projection
            (default 0.1).

    Raises:
        ValueError: If d_model is not divisible by n_heads.
    """

    def __init__(self, d_model=32, n_heads=4, dropout=0.1):
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        self.n_heads = n_heads

        self.fc_Q = Linear(d_model, d_model)
        self.fc_K = Linear(d_model, d_model)
        self.fc_V = Linear(d_model, d_model)
        self.out_fc = nn.Linear(d_model, d_model)

        # LayerNorm over the last (feature) dimension. When normalized_shape
        # is an int it must equal the size of the input's last dimension.
        # With elementwise_affine=True (the default) the layer has a learnable
        # weight and bias: after normalising to zero mean / unit variance the
        # result is multiplied by weight and bias is added. With False the
        # layer has no learnable parameters.
        self.norm = nn.LayerNorm(normalized_shape=d_model, elementwise_affine=True)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, x):
        # [b, len, d_model] -> [b, heads, len, d_model / heads]
        b, length, _ = x.shape
        return x.reshape(b, length, self.n_heads, -1).permute(0, 2, 1, 3)

    def forward(self, Q, K, V, mask):
        """Apply attention of Q over K/V.

        Args:
            Q: Queries, shape [b, len_q, d_model].
            K: Keys, shape [b, len_k, d_model].
            V: Values, shape [b, len_k, d_model].
            mask: Boolean tensor broadcastable to [b, heads, len_q, len_k];
                True marks positions to hide.

        Returns:
            Tensor of shape [b, len_q, d_model].
        """
        # Keep the un-normalised query for the residual connection. No copy is
        # needed because nothing below modifies Q in place.
        residual = Q

        # Normalise first (the same LayerNorm is shared by Q, K and V).
        Q = self.norm(Q)
        K = self.norm(K)
        V = self.norm(V)

        # Linear projections; the feature size is unchanged.
        Q = self.fc_Q(Q)
        K = self.fc_K(K)
        V = self.fc_V(V)

        # Split into heads, attend, and merge the heads again:
        # [b, len, d_model] -> [b, heads, len, head_dim] -> [b, len_q, d_model]
        score = attention(
            self._split_heads(Q),
            self._split_heads(K),
            self._split_heads(V),
            mask,
        )

        # Output projection followed by dropout; shape is unchanged.
        score = self.dropout(self.out_fc(score))

        # Residual connection.
        return residual + score
# Token embedding plus fixed sinusoidal positional encoding
class PositionalEmbedding(Module):
    """Embed token ids and add a fixed (non-trainable) positional encoding.

    Args:
        vocab_size: Number of distinct token ids (default 39 = 3 special
            markers + 10 digits + 26 letters).
        max_len: Longest sequence the positional table covers (default 50).
        d_model: Embedding size (default 32).
    """

    def __init__(self, vocab_size=39, max_len=50, d_model=32):
        super().__init__()

        # pos is the token position, dim the feature index and d_model the
        # total number of features. Even feature indices use sin, odd ones
        # cos. NOTE: odd indices use their own index in the exponent (not the
        # index of the preceding even feature as in the textbook formula);
        # this is kept as is so existing behaviour does not change.
        def get_pre(pos, dim, d_model):
            fenmu = 1e4 ** (dim / d_model)
            posmask = pos / fenmu
            if dim % 2 == 0:
                return math.sin(posmask)
            return math.cos(posmask)

        # Fill the positional table entry by entry.
        pe = torch.empty(max_len, d_model)
        for i in range(max_len):
            for j in range(d_model):
                pe[i, j] = get_pre(i, j, d_model)
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]

        # Registered as a buffer: stored with the module but never updated by
        # the optimizer.
        self.register_buffer('pe', pe)

        # Token embedding table.
        self.embed = nn.Embedding(vocab_size, d_model)
        # Re-initialise the embedding weights from N(0, 0.1).
        self.embed.weight.data.normal_(0, 0.1)

    def forward(self, x):
        """Return embeddings with positional information.

        Args:
            x: Integer tensor of token ids, shape [b, seq_len] with
                seq_len <= max_len.

        Returns:
            Float tensor of shape [b, seq_len, d_model].
        """
        # [b, seq_len] -> [b, seq_len, d_model]
        embed = self.embed(x)
        # Add the first seq_len rows of the positional table (broadcast over
        # the batch); slicing lets sequences shorter than max_len work too.
        return embed + self.pe[:, :x.shape[1]]
# Position-wise feed-forward layer
class FullyConnectedOutput(Module):
    """Pre-LayerNorm feed-forward block with a residual connection.

    Computes x + Dropout(Linear(ReLU(Linear(LayerNorm(x))))).

    Args:
        d_model: Feature size of the input and output (default 32).
        d_ff: Width of the hidden layer (default 64).
        dropout: Dropout probability after the second linear layer
            (default 0.1).
    """

    def __init__(self, d_model=32, d_ff=64, dropout=0.1):
        super().__init__()
        self.fc = nn.Sequential(
            Linear(in_features=d_model, out_features=d_ff),
            nn.ReLU(),
            Linear(in_features=d_ff, out_features=d_model),
            nn.Dropout(dropout),
        )
        self.norm = nn.LayerNorm(normalized_shape=d_model, elementwise_affine=True)

    def forward(self, x):
        """Apply the block to x of shape [..., d_model]; the shape is kept."""
        # Normalise, run the two-layer MLP, then add the un-normalised input
        # back (residual). No copy of x is needed because it is never modified
        # in place.
        return x + self.fc(self.norm(x))
5. mymask.py
import torch
from mydata import zidian_x,zidian_y
def mask_pad(data):
    """Build the padding mask for attention over the source sentence.

    Args:
        data: Tensor of source token ids (not yet embedded), shape
            [b, seq_len].

    Returns:
        Boolean tensor of shape [b, 1, seq_len, seq_len]. Entry
        [n, 0, i, j] is True when token j of sentence n is <PAD>, i.e. whole
        columns are masked so that no token attends to a pad position. Rows
        are not masked: the attention computed *from* pad positions is left
        untouched.
    """
    # Take the sequence length from the input instead of assuming 50.
    seq_len = data.shape[-1]

    # True wherever the token is <PAD>.
    mask = data == zidian_x['<PAD>']

    # [b, seq_len] -> [b, 1, 1, seq_len]
    mask = mask.reshape(-1, 1, 1, seq_len)

    # Repeat the row for every query position (a view, no data is copied):
    # [b, 1, 1, seq_len] -> [b, 1, seq_len, seq_len]
    return mask.expand(-1, 1, seq_len, seq_len)
def mask_tril(data):
    """Build the combined causal + padding mask for decoder self-attention.

    Args:
        data: Tensor of target token ids (not yet embedded), shape
            [b, seq_len].

    Returns:
        Boolean tensor of shape [b, 1, seq_len, seq_len]. Entry
        [n, 0, i, j] is True (hidden) when j > i, i.e. token j comes after
        token i, or when token j of sentence n is <PAD>.
    """
    # Take the sequence length from the input instead of assuming 50.
    seq_len = data.shape[-1]

    # Strictly upper-triangular matrix (diagonal excluded): every token may
    # see itself and earlier tokens only. Created on the same device as the
    # input so the masks can be combined without a device mismatch.
    # [1, seq_len, seq_len]
    ones = torch.ones(1, seq_len, seq_len, dtype=torch.long, device=data.device)
    future = torch.triu(ones, diagonal=1) > 0

    # Pad positions are hidden as well: [b, seq_len] -> [b, 1, seq_len]
    pad = (data == zidian_y['<PAD>']).unsqueeze(1)

    # Union of both masks: [b, 1, seq_len] | [1, seq_len, seq_len]
    # -> [b, seq_len, seq_len]
    mask = pad | future

    # Add the head dimension: [b, 1, seq_len, seq_len]
    return mask.unsqueeze(dim=1)
6. mymodel.py
import torch
import torch.nn as nn
from mymask import mask_pad,mask_tril
from myutil import MultiHead,PositionalEmbedding,FullyConnectedOutput
# Encoder layer
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block."""

    def __init__(self):
        super().__init__()
        self.mh = MultiHead()
        self.fc = FullyConnectedOutput()

    def forward(self, x, mask):
        # Self-attention: x serves as query, key and value.
        attended = self.mh(x, x, x, mask)
        # Feed-forward block on top of the attention output.
        return self.fc(attended)
class Encoder(nn.Module):
    """Stack of three encoder layers applied one after another."""

    def __init__(self):
        super().__init__()
        self.layer1 = EncoderLayer()
        self.layer2 = EncoderLayer()
        self.layer3 = EncoderLayer()

    def forward(self, x, mask):
        # Feed the output of each layer into the next; the same mask is used
        # by all three layers.
        for layer in (self.layer1, self.layer2, self.layer3):
            x = layer(x, mask)
        return x
# Decoder
class Decoder(nn.Module):
    """Decoder: masked self-attention, cross-attention, then feed-forward."""

    def __init__(self):
        super().__init__()
        self.mh1 = MultiHead()
        self.mh2 = MultiHead()
        self.fc = FullyConnectedOutput()

    def forward(self, x, y, mask_pad_x, mask_tril_x):
        # Self-attention over the target sequence, restricted by the mask
        # passed in as mask_tril_x.
        self_attended = self.mh1(y, y, y, mask_tril_x)

        # Cross-attention: target positions query the encoder output x;
        # mask_pad_x is applied to that attention.
        cross_attended = self.mh2(self_attended, x, x, mask_pad_x)

        # Feed-forward block.
        return self.fc(cross_attended)
# Transformer model
class Transformer(nn.Module):
    """Encoder-decoder Transformer for the character translation task."""

    def __init__(self):
        super().__init__()
        self.embed_x = PositionalEmbedding()
        self.embed_y = PositionalEmbedding()
        self.encoder = Encoder()
        self.decoder = Decoder()
        # Final projection from the 32 model features to the 39 vocabulary
        # classes.
        self.fc_out = nn.Linear(32, 39)

    def forward(self, x, y):
        """Run a teacher-forced forward pass.

        Args:
            x: Source token ids (the training loop passes shape [b, 50]).
            y: Decoder input token ids (the training loop passes the target
                without its last token, shape [b, 50]).

        Returns:
            Output of fc_out applied to the decoder result: one score per
            vocabulary class for every decoder position.
        """
        # Padding mask of the source, used by the encoder and by the
        # decoder's cross-attention.
        mask_pad_x = mask_pad(x)
        # Causal + padding mask for the decoder's self-attention. It must be
        # built from the decoder input y (not from x), which is also what
        # predict() does at inference time.
        mask_tril_y = mask_tril(y)

        # Embed the token ids and add positional information.
        x, y = self.embed_x(x), self.embed_y(y)

        # Encode the source.
        x = self.encoder(x, mask_pad_x)

        # Decode, attending to the encoder output.
        y = self.decoder(x, y, mask_pad_x, mask_tril_y)

        # Project every position to vocabulary scores.
        y = self.fc_out(y)
        return y
7. mymain.py
import torch
import torch.nn as nn
from mydata import zidian_y, loader, zidian_xr, zidian_yr
from mymask import mask_pad, mask_tril
from mymodel import Transformer
# Model, loss, optimizer and learning-rate schedule.
model = Transformer()
loss_func = nn.CrossEntropyLoss()
optim = torch.optim.Adam(params=model.parameters(), lr=2e-3)
# Halve the learning rate after every 3 scheduler steps.
sched = torch.optim.lr_scheduler.StepLR(optimizer=optim, step_size=3, gamma=0.5)
# Greedy decoding
def predict(x):
    """Translate one source sentence with greedy decoding.

    Switches the global model to eval mode and leaves it there.

    Args:
        x: Source token ids, shape [1, 50].

    Returns:
        LongTensor of shape [1, 50] with the predicted target ids. Position 0
        is <SOS>; positions 1..49 are filled in one at a time.
    """
    model.eval()

    # Inference only: disable autograd so the 49 decoder passes do not build
    # computation graphs.
    with torch.no_grad():
        mask_pad_x = mask_pad(x)

        # Initial decoder input: <SOS> followed by 49 <PAD> placeholders.
        target = [zidian_y['<SOS>']] + [zidian_y['<PAD>']] * 49
        target = torch.LongTensor(target).unsqueeze(0)  # [1, 50]

        # Embed and encode the source once: [1, 50] -> [1, 50, 32]
        x = model.embed_x(x)
        x = model.encoder(x, mask_pad_x)

        # Generate tokens 1..49, one per iteration.
        for i in range(49):
            y = target
            mask_tril_y = mask_tril(y)

            # Embed the tokens generated so far: [1, 50] -> [1, 50, 32]
            y = model.embed_y(y)

            # Decode against the encoder output.
            y = model.decoder(x, y, mask_pad_x, mask_tril_y)

            # Project to the 39 vocabulary classes: [1, 50, 39]
            out = model.fc_out(y)

            # Scores at the current position: [1, 50, 39] -> [1, 39]
            out = out[:, i, :]

            # Greedy choice of the most likely class.
            out = out.argmax(dim=1)

            # The output at position i is the prediction for token i + 1.
            target[:, i + 1] = out

    return target
# Training loop
for epoch in range(1):
    for i, (x, y) in enumerate(loader):
        # x: [8, 50], y: [8, 51]
        # Teacher forcing: the decoder input is y without its last token and
        # the labels are y shifted left by one.
        pred = model(x, y[:, :-1]).reshape(-1, 39)  # [8, 50, 39] -> [400, 39]
        y = y[:, 1:].reshape(-1)  # [8, 50] -> [400]

        # Leave pad positions out of the loss.
        select = y != zidian_y['<PAD>']
        pred, y = pred[select], y[select]

        loss = loss_func(pred, y)
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Report progress every 200 batches only.
        if i % 200 != 0:
            continue

        # [select, 39] -> [select]
        pred = pred.argmax(1)
        correct = (pred == y).sum().item()
        accuracy = correct / len(pred)
        lr = optim.param_groups[0]['lr']
        print('epoch: ', epoch, ', i: ', i, ', lr: ', lr, ', loss: ', loss.item(), ', accuracy:', accuracy)

    # Advance the learning-rate schedule once per epoch.
    sched.step()
# Qualitative check: take one batch and print, for each of its 8 samples,
# the source, the reference translation and the model's prediction.
x, y = next(iter(loader))
for i in range(8):
    print(i)
    print(''.join(zidian_xr[k] for k in x[i].tolist()))
    print(''.join(zidian_yr[k] for k in y[i].tolist()))
    prediction = predict(x[i].unsqueeze(0))[0]
    print(''.join(zidian_yr[k] for k in prediction.tolist()))
8. 实验结果
<EOS>之前的基本都翻译准确。