Following the referenced article, the model is converted from a Keras LSTM to a PyTorch Transformer; only the parts that differ from the original article are shown here.
Converting the data to tensors
# Convert the prepared data (from the original article) to PyTorch tensors
import torch

x_train_tensor = torch.tensor(x_train)
x_test_tensor = torch.tensor(x_test)
y_train_tensor = torch.tensor(y_train)
y_test_tensor = torch.tensor(y_test)
# The decoder targets are consumed by CrossEntropyLoss, which needs long (int64) class indices
decoder_output_train_tensor = torch.tensor(decoder_output_train).long()
decoder_output_test_tensor = torch.tensor(decoder_output_test).long()
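A quick shape check helps confirm the conversion. The shapes shown in the comment are assumptions based on the original article (sequences padded to length 10), not values taken from it:

# Sanity check (shapes are illustrative; adjust to your own preprocessing)
print(x_train_tensor.shape, y_train_tensor.shape, decoder_output_train_tensor.shape)
# expected something like: (N, 10) (N, 10) (N, 10), all integer dtypes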
Training the model
Building the network
import torch
import torch.nn as nn
import torch.nn.functional as F
# Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SOS_token = tokenizer.word_index['<sos>']
EOS_token = tokenizer.word_index['<eos>']
PAD_token = tokenizer.word_index['<pad>']
SOS_token,EOS_token,PAD_token
(3, 4, 0)
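The model below hard-codes a vocabulary size of 10000 for the embedding and output layers. A quick check (a minimal sketch, assuming the fitted Keras tokenizer from the original article) makes sure the actual vocabulary fits:

# Rough vocabulary check; the exact off-by-one depends on how the tokenizer was built
vocab_size = len(tokenizer.word_index)
print(vocab_size)
assert vocab_size <= 10000, "increase num_embeddings and the predictor size below"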
Positional encoding, taken directly from the standard implementation
import math

class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Initialize the PE (positional encoding) matrix with shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        # Positions [[0], [1], [2], ...]
        position = torch.arange(0, max_len).unsqueeze(1)
        # The term inside sin/cos, rewritten via exp and log
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        # PE(pos, 2i)
        pe[:, 0::2] = torch.sin(position * div_term)
        # PE(pos, 2i+1)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Add a batch dimension up front for easy broadcasting
        pe = pe.unsqueeze(0)
        # register_buffer keeps a tensor that is not a trainable parameter
        # but should still be saved with the model and moved by .to(device)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        x is the embedded input, e.g. (1, 7, 128): batch size 1, 7 tokens, embedding dim 128
        """
        # Add the positional encoding to x
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)
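A quick check of the module in isolation (a sketch; the shape is illustrative):

pe = PositionalEncoding(d_model=128, dropout=0)
dummy = torch.zeros(1, 10, 128)   # (batch, seq_len, d_model)
print(pe(dummy).shape)            # torch.Size([1, 10, 128]), shape is unchanged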
The Transformer model
class PoetTaskModel(nn.Module):
    def __init__(self, d_model=128):
        super(PoetTaskModel, self).__init__()
        # Vocabulary size of 10000
        self.embedding = nn.Embedding(num_embeddings=10000, embedding_dim=d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=4,
                                          num_encoder_layers=5, num_decoder_layers=3,
                                          dim_feedforward=128, batch_first=True)
        self.positional_encoding = PositionalEncoding(d_model, dropout=0)
        # Final linear layer. No Softmax here: it is unnecessary,
        # because the CrossEntropyLoss used later applies it internally.
        self.predictor = nn.Linear(d_model, 10000)

    def forward(self, src, tgt):
        # Build the masks (on the same device as the inputs)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(-1)).to(tgt.device)
        src_key_padding_mask = PoetTaskModel.get_key_padding_mask(src)
        tgt_key_padding_mask = PoetTaskModel.get_key_padding_mask(tgt)

        src = self.embedding(src)
        tgt = self.embedding(tgt)
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)

        # Feed the prepared data to the transformer
        out = self.transformer(src, tgt,
                               tgt_mask=tgt_mask,
                               src_key_padding_mask=src_key_padding_mask,
                               tgt_key_padding_mask=tgt_key_padding_mask)
        """
        Return the transformer output directly. Training and inference behave
        differently, so the final linear projection is applied outside this model.
        """
        return out

    @staticmethod
    def get_key_padding_mask(tokens):
        """
        Build the key_padding_mask: 0 for real tokens, -inf for padding positions.
        """
        key_padding_mask = torch.zeros(tokens.size(), device=tokens.device)
        key_padding_mask[tokens == PAD_token] = -torch.inf
        return key_padding_mask
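To see what the two kinds of masks look like, here is a small sketch (the token values are illustrative; PAD_token is 0 as above):

toy = torch.tensor([[3, 15, 27, 0, 0]])                    # one sequence padded with PAD_token
print(PoetTaskModel.get_key_padding_mask(toy))              # [[0., 0., 0., -inf, -inf]]
print(nn.Transformer.generate_square_subsequent_mask(3))    # 3x3 causal mask, -inf above the diagonal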
Instantiating the network
model = PoetTaskModel().to(device)
model

# Check that a sample passes through the network
src = x_train_tensor[:1].to(device)
tgt = y_train_tensor[:1].to(device)
out = model(src, tgt)
out = model.predictor(out)
out.shape  # torch.Size([1, 10, 10000]): batch_size, seq_len, vocab_size
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
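Note that with the loss as defined above, padding positions in the target also contribute to the loss and to the accuracy figures below. An optional variation (not used in the original article) is to exclude them:

# Optional variation: exclude padding positions from the loss (left commented out)
# criterion = nn.CrossEntropyLoss(ignore_index=PAD_token)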
Training
from tqdm import tqdm

batch_size = 128
epochs = 5

for epoch in range(epochs):
    total_loss = 0
    for i in tqdm(range(0, len(x_train_tensor), batch_size)):
        src = x_train_tensor[i:i+batch_size].to(device)
        tgt = y_train_tensor[i:i+batch_size].to(device)
        tgt_y = decoder_output_train_tensor[i:i+batch_size].to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Run the transformer
        out = model(src, tgt)
        # Project the result to the vocabulary with the final linear layer
        out = model.predictor(out)
        # Compute the loss
        loss = criterion(out.view(-1, out.size(-1)), tgt_y.view(-1))
        # Backpropagate
        loss.backward()
        # Update the parameters
        optimizer.step()
        total_loss += loss.item()
    print("epoch: {}, loss: {}".format(epoch, total_loss / len(x_train_tensor)))
# Accuracy on the training set
model.eval()
with torch.no_grad():
    src = x_train_tensor.to(device)
    tgt = y_train_tensor.to(device)
    tgt_y = decoder_output_train_tensor.to(device)
    out = model(src, tgt)
    out = model.predictor(out)
    out = out.argmax(dim=-1)
    acc = (out == tgt_y).float().mean()
    print("train acc: {}".format(acc))

# Accuracy on the test set
with torch.no_grad():
    src = x_test_tensor.to(device)
    tgt = y_test_tensor.to(device)
    tgt_y = decoder_output_test_tensor.to(device)
    out = model(src, tgt)
    out = model.predictor(out)
    out = out.argmax(dim=-1)
    acc = (out == tgt_y).float().mean()
    print("test acc: {}".format(acc))
Testing the model
test_string = '白日依山盡,黃河入海流。'
# Insert a space between every character of test_string
test_string = ' '.join(test_string)
# Optionally prepend the start marker
# test_string = '<SOS> ' + test_string
# Convert test_string to token ids
test_string_token = tokenizer.texts_to_sequences([test_string])
# Keep only the last 10 characters
test_string_token = test_string_token[0][-10:]
# Convert to numpy and pad to length 10
test_string_mat = pad_sequences([test_string_token], maxlen=10, padding='post', truncating='post')
# Convert test_string_mat to a tensor
test_string_tensor = torch.tensor(test_string_mat).to(device)
test_string_tensor
tensor([[303, 9, 93, 1, 159, 227, 80, 129, 95, 2]], dtype=torch.int32)
Writing the decode function
def decode(model, src, max_iter, SOS_token, EOS_token, PAD_token):
    model = model.eval()
    tgt = torch.LongTensor([[SOS_token]]).to(device)
    # Predict one token at a time until <eos> is produced or max_iter is reached
    for i in range(max_iter):
        out = model(src, tgt)
        predict = model.predictor(out[:, -1])
        # Take the index with the highest score (greedy decoding)
        y = torch.argmax(predict, dim=1)
        # Append it to the previously generated tokens
        tgt = torch.concat([tgt, y.unsqueeze(0)], dim=1)
        # Stop once <eos> is generated
        if y == EOS_token:
            break
    return tgt
decoded = decode(model,
                 src=test_string_tensor,
                 max_iter=100,
                 SOS_token=SOS_token,
                 EOS_token=EOS_token,
                 PAD_token=PAD_token)
decoded
tensor([[ 3, 9, 5, 28, 1, 5, 28, 28, 6, 2, 4]])
# Convert the token ids in decoded back to text
tokenizer.sequences_to_texts(decoded.cpu().numpy())
['< sos > 山 不 見 , 不 見 見 人 。 < eos >']
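Putting the preprocessing and decoding steps together, a small end-to-end helper can generate a line from a raw string. This is a sketch; generate_line is a name introduced here, not from the original article, and it assumes the tokenizer and pad_sequences from the original setup:

def generate_line(model, text, max_iter=100):
    # Space-separate the characters and tokenize with the Keras tokenizer
    tokens = tokenizer.texts_to_sequences([' '.join(text)])[0][-10:]
    mat = pad_sequences([tokens], maxlen=10, padding='post', truncating='post')
    src = torch.tensor(mat).to(device)
    out = decode(model, src, max_iter, SOS_token, EOS_token, PAD_token)
    # Drop the leading <sos> and the trailing <eos>, then map ids back to characters
    ids = out[0, 1:].cpu().numpy().tolist()
    if ids and ids[-1] == EOS_token:
        ids = ids[:-1]
    return ''.join(tokenizer.sequences_to_texts([ids])[0].split())

print(generate_line(model, '白日依山盡,黃河入海流。'))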