Preface
This task uses Baidu's PaddlePaddle framework and was run on AI Studio.
1. Importing the Required Libraries

# 1. Import the required libraries
import paddle
import paddle.nn.functional as F
import re
import numpy as np
2. Reading the Data
MAX_LEN = 20

# 1. Prepare the data
def read_data():
    # Read the parallel corpus
    data = []
    lines = open('./work/cmn.txt', encoding='utf-8').read().strip().split('\n')  # read the file with open
    words_re = re.compile(r'\w+')  # regex used to split an English sentence into words
    for each in lines:
        en_sent, cn_sent, _ = each.split('\t')
        data.append((words_re.findall(en_sent.lower()), list(cn_sent)))
    data_filtered = []
    for each in data:
        # Keep only pairs in which both the English and the Chinese sentence are shorter than 20 tokens
        if len(each[0]) < MAX_LEN and len(each[1]) < MAX_LEN:
            data_filtered.append(each)
    return data_filtered
def build_vocab(data):
    # Build the vocabularies
    eng_vocab = {}  # English vocabulary
    chn_vocab = {}  # Chinese vocabulary
    # Add the special tokens: <pad> for padding, <bos> for sentence start, <eos> for sentence end
    eng_vocab['<pad>'], eng_vocab['<bos>'], eng_vocab['<eos>'] = 0, 1, 2
    chn_vocab['<pad>'], chn_vocab['<bos>'], chn_vocab['<eos>'] = 0, 1, 2
    # Iterate over the data and add every new word to the vocabulary
    eng_idx, chn_idx = 3, 3
    for eng, chn in data:
        for word in eng:
            if word not in eng_vocab:
                eng_vocab[word] = eng_idx
                eng_idx += 1
        for word in chn:
            if word not in chn_vocab:
                chn_vocab[word] = chn_idx
                chn_idx += 1
    return eng_vocab, chn_vocab

data = read_data()  # read the data
eng_vocab, chn_vocab = build_vocab(data)  # build the vocabularies from the data
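Before moving on, it helps to confirm what read_data and build_vocab produced. A minimal check (illustrative; the exact numbers and the first pair depend on your copy of cmn.txt):

# Quick sanity check on the loaded data (output depends on the corpus file)
print(len(data))  # number of sentence pairs that survived the length filter
print(data[0])    # the first pair, e.g. (['hi'], ['嗨', '。']) for a typical cmn.txt
print(len(eng_vocab), len(chn_vocab))  # vocabulary sizes, including the 3 special tokens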
3. Data Preprocessing
# Pad the sentences
padded_eng_sents = []
padded_chn_sents = []
padded_chn_label_sents = []
for eng, chn in data:
    # Append <eos> to each English sentence and pad sentences shorter than MAX_LEN words with <pad>
    padded_eng_sent = eng + ['<eos>'] + ['<pad>'] * (MAX_LEN - len(eng))
    # Prepend <bos> and append <eos> to each Chinese sentence, padding sentences shorter than MAX_LEN tokens with <pad>
    padded_chn_sent = ['<bos>'] + chn + ['<eos>'] + ['<pad>'] * (MAX_LEN - len(chn))
    # The label is the decoder input shifted left by one step (no <bos>), so it is also MAX_LEN + 2 tokens long
    padded_chn_label_sent = chn + ['<eos>'] + ['<pad>'] * (MAX_LEN - len(chn) + 1)
    # Convert every word in a sentence to its index in the vocabulary
    padded_eng_sents.append([eng_vocab[w] for w in padded_eng_sent])
    padded_chn_sents.append([chn_vocab[w] for w in padded_chn_sent])
    padded_chn_label_sents.append([chn_vocab[w] for w in padded_chn_label_sent])

train_eng_sents = np.array(padded_eng_sents).astype('int64')
train_chn_sents = np.array(padded_chn_sents).astype('int64')
train_chn_label_sents = np.array(padded_chn_label_sents).astype('int64')
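Before building the model it is worth verifying that the arrays match the layout just described: the source side is MAX_LEN + 1 tokens long (sentence plus <eos>), while the decoder input and label are MAX_LEN + 2 tokens long. A minimal check (the first dimension depends on how many pairs survive filtering):

print(train_eng_sents.shape)        # (num_samples, MAX_LEN + 1) = (num_samples, 21)
print(train_chn_sents.shape)        # (num_samples, MAX_LEN + 2) = (num_samples, 22)
print(train_chn_label_sents.shape)  # (num_samples, MAX_LEN + 2) = (num_samples, 22)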
4. Building the Model
embedding_size = 128
hidden_size = 256
epochs = 50
batch_size = 64
eng_vocab_size = len(eng_vocab)
chn_vocab_size = len(chn_vocab)

# Encoder
class Encoder(paddle.nn.Layer):
    def __init__(self):
        super(Encoder, self).__init__()
        # Word-embedding layer
        self.embed = paddle.nn.Embedding(eng_vocab_size, embedding_size)
        # LSTM layer
        self.lstm = paddle.nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=1)

    def forward(self, x):
        # The input has shape [batch_size, time_steps]
        x = self.embed(x)
        # After the embedding layer the shape is [batch_size, time_steps, embedding_size], where time_steps = MAX_LEN + 1
        x, (_, _) = self.lstm(x)
        # After the LSTM layer the shape is [batch_size, time_steps, hidden_size], where time_steps = MAX_LEN + 1
        return x
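The shapes documented in the comments can be smoke-tested by running the encoder on a dummy batch (a small illustrative sketch, not part of the training flow):

# Hypothetical smoke test: a dummy batch of 4 all-<pad> source sentences
dummy_src = paddle.zeros([4, MAX_LEN + 1], dtype='int64')
print(Encoder()(dummy_src).shape)  # expected: [4, MAX_LEN + 1, hidden_size] = [4, 21, 256]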
# Decoder
class Decoder(paddle.nn.Layer):
    def __init__(self):
        super(Decoder, self).__init__()
        # Word-embedding layer
        self.embed = paddle.nn.Embedding(chn_vocab_size, embedding_size)
        # LSTM layer
        self.lstm = paddle.nn.LSTM(input_size=embedding_size + hidden_size, hidden_size=hidden_size)
        # Layers that compute the attention scores
        self.attention_linear1 = paddle.nn.Linear(hidden_size * 2, hidden_size)
        self.attention_linear2 = paddle.nn.Linear(hidden_size, 1)
        self.linear = paddle.nn.Linear(hidden_size, chn_vocab_size)

    def forward(self, x, previous_hidden, previous_cell, encoder_outputs):
        # The input x has shape [batch_size, 1]
        # previous_hidden, the hidden state of the previous time step, has shape [batch_size, 1, hidden_size]
        # previous_cell, the cell state of the previous time step, has shape [batch_size, 1, hidden_size]
        # encoder_outputs, the encoder hidden states at every time step, has shape [batch_size, time_steps, hidden_size]
        # The encoder input is the English sentence; each sentence has length MAX_LEN + 1 after appending <eos>
        x = self.embed(x)
        # After the embedding layer the shape is [batch_size, 1, embedding_size]
        # Concatenate the encoder hidden states of every time step with the decoder hidden state of the previous time step.
        # encoder_outputs has shape [batch_size, time_steps, hidden_size],
        # while previous_hidden has shape [batch_size, 1, hidden_size],
        # so previous_hidden is first replicated along the time dimension with paddle.tile,
        # and then concatenated with encoder_outputs along the last dimension with paddle.concat.
        # The resulting attention_inputs has shape [batch_size, time_steps, hidden_size * 2]
        attention_inputs = paddle.concat((encoder_outputs, paddle.tile(previous_hidden, repeat_times=[1, MAX_LEN + 1, 1])), axis=-1)
        # Transform with a single-hidden-layer perceptron
        attention_hidden = self.attention_linear1(attention_inputs)
        attention_hidden = F.tanh(attention_hidden)
        attention_logits = self.attention_linear2(attention_hidden)
        # The output now has shape [batch_size, time_steps, 1]
        attention_logits = paddle.squeeze(attention_logits)  # remove the dimensions of size 1 from the tensor's shape
        # The output now has shape [batch_size, time_steps]
        # A softmax yields the attention weights, with shape [batch_size, time_steps];
        # each value lies between 0 and 1, and the weights are normalized over the time dimension
        attention_weights = F.softmax(attention_logits)
        # encoder_outputs has shape [batch_size, time_steps, hidden_size], while the attention weights
        # have shape [batch_size, time_steps], so paddle.unsqueeze first adds a trailing dimension,
        # and paddle.expand_as then broadcasts the weights to the shape of encoder_outputs
        attention_weights = paddle.expand_as(paddle.unsqueeze(attention_weights, -1), encoder_outputs)
        # Element-wise multiplication gives the context vector
        context_vector = paddle.multiply(encoder_outputs, attention_weights)
        # The context vector now has shape [batch_size, time_steps, hidden_size]
        # Sum the context vector over the time dimension
        context_vector = paddle.sum(context_vector, 1)
        # The context vector now has shape [batch_size, hidden_size]
        context_vector = paddle.unsqueeze(context_vector, 1)  # insert a dimension of size 1 at position 1
        # The context vector now has shape [batch_size, 1, hidden_size]
        # x, the output of the embedding layer, has shape [batch_size, 1, embedding_size]; concatenating x with
        # the context vector along the last dimension gives shape [batch_size, 1, embedding_size + hidden_size]
        lstm_input = paddle.concat((x, context_vector), axis=-1)
        # Transpose previous_hidden to shape [1, batch_size, hidden_size]
        previous_hidden = paddle.transpose(previous_hidden, [1, 0, 2])
        # Transpose previous_cell to shape [1, batch_size, hidden_size]
        previous_cell = paddle.transpose(previous_cell, [1, 0, 2])
        # Feed the data into the LSTM layer
        x, (hidden, cell) = self.lstm(lstm_input, (previous_hidden, previous_cell))
        hidden = paddle.transpose(hidden, [1, 0, 2])
        cell = paddle.transpose(cell, [1, 0, 2])
        # After these transposes, the hidden state of the current time step has shape [batch_size, 1, hidden_size]
        output = self.linear(hidden)
        # The output now has shape [batch_size, 1, chn_vocab_size]
        output = paddle.squeeze(output)  # remove the dimensions of size 1 from the tensor's shape
        # The output now has shape [batch_size, chn_vocab_size]
        return output, (hidden, cell)
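A single decoder step can be smoke-tested in the same spirit (again an illustrative sketch; the zero tensors stand in for real states and encoder outputs):

# Hypothetical smoke test: one decoding step on a dummy batch of 4
dummy_word = paddle.zeros([4, 1], dtype='int64')         # current target token
dummy_hidden = paddle.zeros([4, 1, hidden_size])         # previous hidden state
dummy_cell = paddle.zeros([4, 1, hidden_size])           # previous cell state
dummy_enc = paddle.zeros([4, MAX_LEN + 1, hidden_size])  # stand-in encoder outputs
out, (h, c) = Decoder()(dummy_word, dummy_hidden, dummy_cell, dummy_enc)
print(out.shape)  # expected: [4, chn_vocab_size]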
5. Training the Model
encoder = Encoder()  # instantiate the encoder
decoder = Decoder()  # instantiate the decoder
# Optimizer over the parameters of both networks
optimizer = paddle.optimizer.Adam(learning_rate=0.001, parameters=encoder.parameters() + decoder.parameters())
# Training loop
for epoch in range(epochs):
    print("Epoch {} started...".format(epoch))
    # Shuffle the data
    order = np.random.permutation(len(train_eng_sents))
    train_eng_sents_shuffled = train_eng_sents[order]
    train_chn_sents_shuffled = train_chn_sents[order]
    train_chn_label_sents_shuffled = train_chn_label_sents[order]
    for iteration in range(train_eng_sents_shuffled.shape[0] // batch_size):
        eng_sentence = train_eng_sents_shuffled[(batch_size * iteration):(batch_size * (iteration + 1))]
        eng_sentence = paddle.to_tensor(eng_sentence)
        encoder_outputs = encoder(eng_sentence)
        x_chn_data = train_chn_sents_shuffled[(batch_size * iteration):(batch_size * (iteration + 1))]
        x_chn_label_data = train_chn_label_sents_shuffled[(batch_size * iteration):(batch_size * (iteration + 1))]
        # shape: (batch, num_layers(=1 here) * num_directions(=1 here), hidden_size)
        hidden = paddle.zeros([batch_size, 1, hidden_size])
        cell = paddle.zeros([batch_size, 1, hidden_size])
        loss = paddle.zeros([1])
        # Call the decoder in a loop, feeding one time step of the batch at a time
        for i in range(MAX_LEN + 2):
            chn_word = paddle.to_tensor(x_chn_data[:, i:i+1])
            chn_word_label = paddle.to_tensor(x_chn_label_data[:, i])
            logits, (hidden, cell) = decoder(chn_word, hidden, cell, encoder_outputs)
            step_loss = F.cross_entropy(logits, chn_word_label)
            loss += step_loss
        loss = loss / (MAX_LEN + 2)
        if iteration % 200 == 0:
            print("iter {}, loss:{}".format(iteration, loss.numpy()))
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
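Once training finishes, the weights can be saved so that inference does not require retraining. A minimal sketch (the file paths below are arbitrary choices, not from the original post):

# Save the trained weights (paths are illustrative)
paddle.save(encoder.state_dict(), './work/encoder.pdparams')
paddle.save(decoder.state_dict(), './work/decoder.pdparams')
# They can later be restored with:
# encoder.set_state_dict(paddle.load('./work/encoder.pdparams'))
# decoder.set_state_dict(paddle.load('./work/decoder.pdparams'))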
6. Model Prediction
encoder.eval()
decoder.eval()
num_of_examples_to_evaluate = 10
indices = np.random.choice(len(train_eng_sents), num_of_examples_to_evaluate, replace=False)
x_data = train_eng_sents[indices]
sent = paddle.to_tensor(x_data)
en_repr = encoder(sent)
word = np.array([[chn_vocab['<bos>']]] * num_of_examples_to_evaluate)
word = paddle.to_tensor(word)
hidden = paddle.zeros([num_of_examples_to_evaluate, 1, hidden_size])
cell = paddle.zeros([num_of_examples_to_evaluate, 1, hidden_size])
decoded_sent = []
for i in range(MAX_LEN + 2):
    logits, (hidden, cell) = decoder(word, hidden, cell, en_repr)
    word = paddle.argmax(logits, axis=1)
    decoded_sent.append(word.numpy())
    word = paddle.unsqueeze(word, axis=-1)
results = np.stack(decoded_sent, axis=1)
for i in range(num_of_examples_to_evaluate):
    en_input = " ".join(data[indices[i]][0])
    ground_truth_translate = "".join(data[indices[i]][1])
    model_translate = ""
    for k in results[i]:
        w = list(chn_vocab)[k]
        if w != '<pad>' and w != '<eos>':
            model_translate += w
    print(en_input)
    print("true: {}".format(ground_truth_translate))
    print("pred: {}".format(model_translate))
7. Results
Closing Remarks
This is just a personal record of a deep learning experiment; corrections of any mistakes or shortcomings in the text are welcome.
Writing this took some effort, so please give it a like!