Preface
I've spent the last couple of days learning about RNNs and wanted to implement machine translation with one. If you have any questions, feel free to reach out!
1. Dataset
The dataset used in this post comes from Mu Li's book Dive into Deep Learning (《动手学深度学习》).
Dataset download: https://github.com/codefly-xtl/translation/tree/main/data
First, a quick look at the data: each line contains an English sentence on the left and its French translation on the right, separated by a tab.
1.1 Downloading and preprocessing the data
In this step we read the raw text into raw_data, replace the non-breaking space characters with ordinary spaces, lowercase everything, insert a space before each punctuation mark, and finally return the processed string.
def process_data():
    # Return True if char is a punctuation mark not preceded by a space
    def no_peace(char, pre_char):
        return char in set(',.!?') and pre_char != ' '

    # Load the raw text
    with open('./data/fra.txt', encoding='utf-8') as f:
        raw_data = f.read()
    # Preprocess: replace non-breaking spaces with ordinary spaces, lowercase,
    # and insert a space before punctuation marks
    raw_data = raw_data.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    out = [' ' + char if i > 0 and no_peace(char, raw_data[i - 1]) else char
           for i, char in enumerate(raw_data)]
    data = ''.join(out)
    return data
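As a quick sanity check, here is a minimal, self-contained sketch of the same rules applied to a single made-up sentence (the sample string is purely illustrative, not taken from the dataset):
# Hypothetical sample sentence, for illustration only
sample = 'I like you!'
sample = sample.replace('\u202f', ' ').replace('\xa0', ' ').lower()
out = [' ' + ch if i > 0 and ch in set(',.!?') and sample[i - 1] != ' ' else ch
       for i, ch in enumerate(sample)]
print(''.join(out))  # expected: 'i like you !'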
1.2 Splitting the data into source and target
In this step we extract the source and target sentences: source holds the original English sentences and target holds the French translations. We split the data into lines on \n, split each line into its two parts on \t, and then split each part into a list of words on spaces. For example: source = [['i', 'am', 'person'], ['i', 'like', 'you']]
def get_sentence(data):
    # Lists for the two languages
    source = []
    target = []
    # Iterate over the lines
    for line in data.split('\n'):
        # Each line has two tab-separated parts
        parts = line.split('\t')
        if len(parts) == 2:
            # English goes into source
            source.append(parts[0].split(' '))
            # French goes into target
            target.append(parts[1].split(' '))
    # source looks like:
    # source = [['i', 'am', 'person'], ['i', 'like', 'you']]
    return source, target
1.3 Defining the vocabulary class
The class is built as follows:
- First collect every word from the list of sentences into all_words
- Then sort the words by frequency to get word_preq
- Next build index_to_word and word_to_index, which convert a single index to its word and vice versa
- Then implement to_index and to_word, which convert between a list of indices and the corresponding list of words, e.g. index = [88, 102, 562, 4850] becomes word = ['点', '个', '赞', '😀']; to_index returns a tensor
- To translate a whole sentence later, there is also a prase method that converts one raw sentence into a tensor of indices; the returned data is two-dimensional with shape (batch_size, num_steps)
- Implement the __len__() method, which returns the vocabulary size
import torch

# Vocabulary class
class Vocab:
    # reserved_tokens are special tokens reserved up front, e.g. the start token <bos>
    def __init__(self, sentence, min_freq=0, reserved_tokens=None):
        if reserved_tokens is None:
            reserved_tokens = []
        # Collect every word
        self.all_words = [word for words in sentence for word in words]
        # Count word frequencies and sort from most to least frequent
        self.word_preq = self.get_word_preq()
        # Start index_to_word / word_to_index with <unk> and the reserved tokens;
        # index_to_word maps an index to its word, word_to_index maps a word to its index
        self.index_to_word = ['<unk>'] + reserved_tokens
        self.word_to_index = {word: index for index, word in enumerate(self.index_to_word)}
        # Then add all remaining words
        for word, freq in self.word_preq:
            if freq < min_freq:
                break
            self.index_to_word.append(word)
            self.word_to_index[word] = len(self.word_to_index)

    # Count word frequencies
    def get_word_preq(self):
        word_preq = {}
        for word in self.all_words:
            if word not in word_preq:
                word_preq[word] = 1
            else:
                word_preq[word] += 1
        # Sort by frequency, descending
        word_preq = sorted(word_preq.items(), key=lambda x: x[1], reverse=True)
        return word_preq

    # Vocabulary size
    def __len__(self):
        return len(self.index_to_word)

    # Convert a list of indices to a list of words
    def to_word(self, indexs):
        return [self.index_to_word[i] for i in indexs]

    # Convert a list of words to a tensor of indices
    def to_index(self, words):
        output = []
        for word in words:
            if word not in self.word_to_index:
                output.append(self.word_to_index['<unk>'])
            else:
                output.append(self.word_to_index[word])
        return torch.tensor(output)

    # Convert one raw sentence into tensors for prediction
    def prase(self, raw_data, num_steps):
        # Same preprocessing as process_data: normalize spaces, lowercase,
        # insert a space before punctuation
        raw_data = raw_data.replace('\u202f', ' ').replace('\xa0', ' ').lower()
        out = [' ' + char if i > 0 and char in set(',.!?') and raw_data[i - 1] != ' ' else char
               for i, char in enumerate(raw_data)]
        data = ''.join(out)
        source = data.split(' ')
        source.append('<eos>')
        source_valid_len = len(source)
        source_word = truncate_or_pad(source, num_steps)
        source_index = self.to_index(source_word)
        # Shapes: (batch_size=1, num_steps) and (1, 1)
        return source_index.unsqueeze(0), torch.tensor(source_valid_len).reshape(-1, 1)
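A minimal usage sketch for the class (the toy sentences are made up; with reserved_tokens=['<pad>', '<bos>', '<eos>'] the indices 0-3 map to <unk>, <pad>, <bos> and <eos>):
# Toy sentences, for illustration only
sentences = [['i', 'am', 'person'], ['i', 'like', 'you']]
vocab = Vocab(sentences, min_freq=0, reserved_tokens=['<pad>', '<bos>', '<eos>'])
print(len(vocab))                            # 4 special tokens + 5 distinct words = 9
print(vocab.to_index(['i', 'like', 'cat']))  # 'cat' is unknown, so it maps to index 0 (<unk>)
print(vocab.to_word([2, 3]))                 # ['<bos>', '<eos>']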
1.4 Building the training set
This part builds the training data. The steps are:
- First load and preprocess the data, then split it into source_sentences and target_sentences
- Build the vocabularies source_Vocab and target_Vocab from these two sentence lists
- Compute how many batches the dataset yields from the number of sentences and batch_size
- Each batch stores four parts: source_batch, source_len_batch, target_batch and target_len_batch; the _batch tensors hold the sentences (as word indices) and the _len_batch tensors hold the valid length of each sentence.
# Truncate or pad a sentence to num_steps tokens
def truncate_or_pad(line, num_steps):
    # e.g. line = ['i', 'am', 'person']
    # Truncate if too long
    if len(line) > num_steps:
        return line[:num_steps]
    # Otherwise pad with <pad>
    for i in range(num_steps - len(line)):
        line.append('<pad>')
    return line
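# Quick illustration with toy inputs and num_steps=5:
#   truncate_or_pad(['i', 'am', 'person', '<eos>'], 5)   -> ['i', 'am', 'person', '<eos>', '<pad>']
#   truncate_or_pad(['a', 'b', 'c', 'd', 'e', 'f'], 5)   -> ['a', 'b', 'c', 'd', 'e']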
def get_train_iter(batch_size, num_steps):
    data = process_data()
    # e.g. source_sentences = [['i', 'am', 'person'], ['i', 'like', 'you']]
    source_sentences, target_sentences = get_sentence(data)
    source_Vocab = Vocab(source_sentences, min_freq=0, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    target_Vocab = Vocab(target_sentences, min_freq=0, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    database = []
    batch_num = len(source_sentences) // batch_size
    # Build each batch and append it to database
    for i in range(batch_num):
        source_batch = []
        source_len_batch = []
        target_batch = []
        target_len_batch = []
        for j in range(batch_size):
            # Take one sentence and its translation, append <eos>
            source_sentence = source_sentences[i * batch_size + j] + ['<eos>']
            target_sentence = target_sentences[i * batch_size + j] + ['<eos>']
            source_valid_len = len(source_sentence)
            target_valid_len = len(target_sentence)
            # Truncate sentences longer than num_steps, pad shorter ones
            source_word = truncate_or_pad(source_sentence, num_steps)
            target_word = truncate_or_pad(target_sentence, num_steps)
            # Map the words to their indices
            source_index = source_Vocab.to_index(source_word)
            target_index = target_Vocab.to_index(target_word)
            # Collect them
            source_batch.append(source_index)
            source_len_batch.append(source_valid_len)
            target_batch.append(target_index)
            target_len_batch.append(target_valid_len)
        source_batch_tensor = torch.stack(source_batch)
        target_batch_tensor = torch.stack(target_batch)
        source_len_batch_tensor = torch.tensor(source_len_batch)
        target_len_batch_tensor = torch.tensor(target_len_batch)
        database.append((source_batch_tensor, source_len_batch_tensor, target_batch_tensor, target_len_batch_tensor))
    return database, source_Vocab, target_Vocab
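A minimal sketch of how the returned iterator might be used (it assumes ./data/fra.txt is in place; the number of batches and the vocabulary sizes depend on the corpus):
train_iter, source_Vocab, target_Vocab = get_train_iter(batch_size=64, num_steps=20)
source, source_len, target, target_len = train_iter[0]
print(source.shape, target.shape)          # torch.Size([64, 20]) torch.Size([64, 20])
print(source_len.shape, target_len.shape)  # torch.Size([64]) torch.Size([64])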
2. Defining the model
2.1 Importing the required packages
import torch
from torch import nn
import utils
2.2 Defining the Encoder
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, bidirectional=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, bidirectional=bidirectional)
        self.num_layers = num_layers
        self.num_hiddens = num_hiddens
        self.bidirectional = bidirectional
        if bidirectional:
            # Each layer has two directions, so the two direction states must be merged
            self.linear_hidden = nn.Linear(self.num_hiddens * 2, self.num_hiddens)
            self.linear_content = nn.Linear(self.num_hiddens * 2, self.num_hiddens)

    def forward(self, X):
        X = self.embedding(X)
        X = X.permute(1, 0, 2)
        output, state = self.rnn(X)
        hidden_state, content_state = state
        if self.bidirectional:
            # Concatenate the forward and backward states of each layer, then project them
            # with a linear layer so their size matches the decoder's num_hiddens
            hidden_state = torch.cat(
                [hidden_state[:self.num_layers * 2:2, :, :], hidden_state[1:self.num_layers * 2 + 1:2, :, :]], dim=2)
            content_state = torch.cat(
                [content_state[:self.num_layers * 2:2, :, :], content_state[1:self.num_layers * 2 + 1:2, :, :]], dim=2)
            hidden_state = self.linear_hidden(hidden_state)
            content_state = self.linear_content(content_state)
        return hidden_state, content_state
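To make the state shapes concrete, here is a small sketch with made-up sizes (it reuses the imports from 2.1 and runs a bidirectional encoder on random token indices):
# Hypothetical sizes, for illustration only
enc = Encoder(vocab_size=100, embed_size=8, num_hiddens=16, num_layers=2, bidirectional=True)
X = torch.randint(0, 100, (4, 10))  # (batch_size=4, num_steps=10)
hidden_state, content_state = enc(X)
print(hidden_state.shape)   # torch.Size([2, 4, 16]), i.e. (num_layers, batch_size, num_hiddens)
print(content_state.shape)  # torch.Size([2, 4, 16])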
2.3 Defining the Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size + num_hiddens * 2, num_hiddens, num_layers)
        self.linear = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, encoder_output_state):
        return encoder_output_state

    def forward(self, X, state, predict=False):
        if not predict:
            X = self.embedding(X).permute(1, 0, 2)
            # Everything the decoder knows comes from the encoder's final time-step state,
            # so the last layer of that state is too valuable to use only as the initial state:
            # it is also concatenated to the decoder input at every time step
            hidden_state, content_state = state
            new_hidden_state = hidden_state[-1].unsqueeze(0).repeat(X.shape[0], 1, 1)
            new_content_state = content_state[-1].unsqueeze(0).repeat(X.shape[0], 1, 1)
            X = torch.cat([new_hidden_state, new_content_state, X], dim=2)
        # X has shape (num_steps, batch_size, decoder_embed_size + encoder_hidden_num * 2)
        output, state = self.rnn(X, state)
        output = self.linear(output).permute(1, 0, 2)
        return output, state
2.4 Defining the seq2seq model
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target):
        encoder_output_state = self.encoder(source)
        decoder_init_state = self.decoder.init_state(encoder_output_state)
        return self.decoder(target, decoder_init_state)
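Putting the pieces together, a quick shape check with made-up sizes (note that the encoder and decoder must share num_hiddens, because the decoder concatenates the encoder state to its input):
# Hypothetical sizes, for illustration only
enc = Encoder(vocab_size=100, embed_size=8, num_hiddens=16, num_layers=2, bidirectional=True)
dec = Decoder(vocab_size=120, embed_size=8, num_hiddens=16, num_layers=2)
net = EncoderDecoder(enc, dec)
source = torch.randint(0, 100, (4, 10))  # (batch_size=4, num_steps=10)
target = torch.randint(0, 120, (4, 10))
output, state = net(source, target)
print(output.shape)  # torch.Size([4, 10, 120]), i.e. (batch_size, num_steps, target_vocab_size)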
2.5 Defining the loss
The loss matrix has shape (batch_size, num_steps), and the tail of every sentence consists of padding, so the padded positions must not contribute to the loss.
class Myloss(nn.CrossEntropyLoss):
    def value_mask(self, X, valid_len):
        # Positions at or beyond the valid length are padding and get weight 0
        mask = torch.arange(X.shape[1], dtype=torch.float32, device=X.device)[None, :] >= valid_len[:, None]
        X[mask] = 0
        return X

    def forward(self, predict, target, valid_len=None):
        weights = torch.ones_like(target)
        weights = self.value_mask(weights, valid_len)
        self.reduction = 'none'
        unweighted_loss = super().forward(predict.permute(0, 2, 1), target)
        weighted_loss = unweighted_loss * weights
        return weighted_loss.mean()
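A small sketch of the masking with made-up numbers: only the first valid_len positions of each sequence contribute to the loss.
# Toy tensors, for illustration only
loss_fn = Myloss()
logits = torch.randn(2, 4, 10)           # (batch_size=2, num_steps=4, vocab_size=10)
labels = torch.randint(0, 10, (2, 4))    # (batch_size, num_steps)
valid_len = torch.tensor([2, 4])         # sentence 1 has 2 valid tokens, sentence 2 has 4
print(loss_fn(logits, labels, valid_len))  # scalar; the padded positions of sentence 1 are zeroed out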
3. The training function
def train(net, data_iter, lr, num_epochs, device):
    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = Myloss()
    net.train()
    for epoch in range(num_epochs):
        for batch in data_iter:
            optimizer.zero_grad()
            # Move the batch to the device
            source, source_valid_len, target, target_valid_len = [x.to(device) for x in batch]
            # Prepend the index of <bos> (which is 2) to every target sentence
            bos = torch.tensor([2] * target.shape[0], device=device).reshape(-1, 1)
            decoder_input = torch.cat([bos, target[:, :-1]], dim=1)
            # Forward pass, loss, backward pass, update
            Y_hat, _ = net(source, decoder_input)
            l = loss(Y_hat, target, target_valid_len)
            l.backward()
            optimizer.step()
            print(l)
4. The prediction function
def predict(net, source_sentence, source_Vocab, target_Vocab, num_steps, device):
    # Stores the indices of the translated words
    result = []
    # Convert the source sentence to tensors
    source, source_valid_len = source_Vocab.prase(source_sentence, num_steps)
    source, source_valid_len = source.to(device), source_valid_len.to(device)
    # Run the encoder to get its final state
    state = net.encoder(source)
    # Keep the last layer of the encoder's final state for the decoder input
    hidden_state, content_state = state
    new_hidden_state = hidden_state[-1].unsqueeze(0)
    new_content_state = content_state[-1].unsqueeze(0)
    # Initialize the decoder's first state
    state = net.decoder.init_state(state)
    # Build the first decoder input from the <bos> token (index 2), as in training
    X = torch.tensor(target_Vocab.word_to_index['<bos>']).reshape(-1, 1).to(device)
    X = net.decoder.embedding(X).permute(1, 0, 2)
    X = torch.cat([new_hidden_state, new_content_state, X], dim=2)
    for i in range(num_steps):
        # predict=True: X is already embedded and concatenated with the encoder state
        Y, state = net.decoder(X, state, True)
        X = Y.argmax(dim=2)
        # Index of the most likely next token
        pred = X.squeeze(dim=0).type(torch.int32).item()
        # Stop once <eos> is predicted
        if pred == target_Vocab.word_to_index['<eos>']:
            break
        X = net.decoder.embedding(X).permute(1, 0, 2)
        X = torch.cat([new_hidden_state, new_content_state, X], dim=2)
        result.append(pred)
    return ' '.join(target_Vocab.to_word(result))
5. Testing
5.1 Defining the hyperparameters
batch_size = 64
num_steps = 20
train_iter, source_Vocab, target_Vocab = utils.get_train_iter(batch_size, num_steps)
encoder_embed_size = 300
decoder_embed_size = 300
hidden_size = 64
num_layers = 2
encoder = Encoder(len(source_Vocab), encoder_embed_size, hidden_size, num_layers, True)
decoder = Decoder(len(target_Vocab), decoder_embed_size, hidden_size, num_layers)
net = EncoderDecoder(encoder, decoder)
num_epoch = 100
lr = 0.001
device = 'cuda'
5.2 Training
train(net, train_iter, lr, num_epoch, device)
# The printed losses look like this:
tensor(0.0147, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0137, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0139, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0128, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0126, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0126, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0123, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0120, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0128, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0121, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0117, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0122, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0119, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(0.0124, device='cuda:0', grad_fn=<MeanBackward0>)
5.3 Prediction
predict(net, 'He did it just for fun.', source_Vocab, target_Vocab, num_steps, device)