本文是论文Attention Is All You Need (Transformer)的TensorFlow实现解读。在GitHub上面找到了一个TensorFlow的实现代码,我跑了数据处理部分;模型训练我的笔记本跑不起来,所以只跑了一个step,得到训练前的翻译情况和对应损失,和大家一起分析下别人跑出来的结果。
具体模型可以参考我的另一篇博客
参数定义:
我们首先建立一个flags.py的文件用于存储后续需要的参数
import tensorflow as tf

# Shorthand for the TF1 flag-definition module; every hyper-parameter of the
# model and the training run is registered here and read via FLAGS elsewhere.
_flags = tf.app.flags

# Application parameters
_flags.DEFINE_string('mode', 'train', 'mode to train/test')
_flags.DEFINE_string('dataset', 'dummy', 'dataset')

# Model hyper-parameters (Transformer base configuration)
_flags.DEFINE_integer('stack_num', 6, 'stack num')
_flags.DEFINE_integer('d_model', 512, 'model dimension')
_flags.DEFINE_integer('d_k', 64, 'key dim')
_flags.DEFINE_integer('d_v', 64, 'value dim')
_flags.DEFINE_integer('h', 8, 'stack of multihead attention')

# Training parameters
_flags.DEFINE_integer('num_epochs', 5, 'num epochs')
_flags.DEFINE_integer('batch_size', 32, 'batch size')
_flags.DEFINE_float('dropout_keep', 0.9, 'dropout keep rate')
_flags.DEFINE_integer('pad_length', 60, 'pad length')
_flags.DEFINE_float('learn_rate', 1e-4, 'learn rate')
_flags.DEFINE_boolean('use_pretrained_vec', False, 'flag for pretrained vector')

# Parsed flag values, imported by the other modules as `from flags import FLAGS`.
FLAGS = _flags.FLAGS
数据预处理:
我们采用评估活动IWSLT机器翻译任务的数据集:IWSLT 2016 German–English parallel corpus,它们是用于构建和测试MT系统的并行数据集。建立preprocess.py对数据进行处理:
- 训练集包括“train.tags.de-en.en”(196884句英语),“train.tags.de-en.de”(393768句德语)
- 我们取出所有出现过的单词构建词汇表:58641+2个英语单词,126797+2个德语单词
- 根据词汇表,我们对每句话进行单词转化索引以及padding为等长pad_length的句子表示,方便后续映射
import re
import tensorflow as tf
import operator
from flags import FLAGS
# Vocabulary maps (word -> integer id) for the two sides of the parallel
# corpus; ids 0 and 1 are reserved for the end- and start-of-sentence tokens.
# `transform` below grows these dicts as new words are encountered.
word_dict_a = {'<EOS>': 0, '<SOS>': 1}
word_dict_b = {'<EOS>': 0, '<SOS>': 1}
# Compiled once at import time instead of inside the function: the `re` module
# caches compiled patterns, but hoisting avoids even the cache lookup per call
# and makes the token grammar a visible module-level constant.
_TOKENIZER_RE = re.compile(
    r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", re.UNICODE)


def parser(data):
    """Tokenize one sentence: lowercase it and extract word-like tokens.

    Returns a list of tokens (words, contractions like "don't" and
    hyphenated words); punctuation is dropped.  Note: because the input is
    lowercased first, the two uppercase branches of the pattern never match —
    only the ``[\\'\\w\\-]+`` branch is effective.
    """
    return _TOKENIZER_RE.findall(data.lower())
def transform(arr, word_dict):
    """Map each token in `arr` to its integer id via `word_dict`.

    Unseen tokens are added to `word_dict` in encounter order, receiving the
    current vocabulary size as their id (so ids stay dense).  Mutates
    `word_dict` in place and returns the list of ids.
    """
    # setdefault inserts len(word_dict) (evaluated before insertion, i.e. the
    # next free id) only when the token is missing — same effect as the
    # explicit membership test.
    return [word_dict.setdefault(token, len(word_dict)) for token in arr]
def padding(arr):
    """Pad one id sequence to exactly FLAGS.pad_length entries.

    Zeros (the <EOS> id) are appended at the end ('post'); sequences longer
    than pad_length are truncated (from the front, pad_sequences' default).
    Returns the single padded sequence, not the batch wrapper.
    """
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        [arr], FLAGS.pad_length, padding='post')
    return padded[0]
# --- Data preprocessing for the training corpus ---
# Select input/output/vocab file paths for the chosen dataset and mode.
# NOTE(review): this assignment overrides the 'dataset' flag default at import
# time, forcing the IWSLT16 corpus regardless of the command line.
FLAGS.dataset = 'IWSLT16'
if FLAGS.dataset == 'dummy':
    file_base = './data/dummy/'
    if FLAGS.mode == 'train':
        file_a = file_base+'train.a.txt'
        file_b = file_base+'train.b.txt'
        file_a_out = file_base+'train.a.ids.txt'
        file_b_out = file_base+'train.b.ids.txt'
        file_vocab_a = file_base+'vocab.a.txt'
        file_vocab_b = file_base+'vocab.b.txt'
elif FLAGS.dataset == 'IWSLT16':
    file_base = './data/IWSLT16/'
    if FLAGS.mode == 'train':
        file_a = file_base+'train.tags.de-en.en'
        file_b = file_base+'train.tags.de-en.de'
        file_a_out = file_base+'train.en.ids.txt'
        file_b_out = file_base+'train.de.ids.txt'
        file_vocab_a = file_base+'vocab.en.txt'
        file_vocab_b = file_base+'vocab.de.txt'
for fin, fout, word_dict in [(file_a, file_a_out, word_dict_a),
                             (file_b, file_b_out, word_dict_b)]:
    # Convert each corpus line to a padded id sequence and write it out,
    # growing the corresponding vocabulary as a side effect.
    with open(fout, 'w') as f_out:
        # NOTE(review): 'gbk' looks odd for a German/English corpus and
        # errors='ignore' silently drops undecodable bytes — verify the
        # corpus encoding (IWSLT files are typically UTF-8).
        with open(fin,'r',encoding='gbk',errors='ignore') as f:
            for line in f:
                # Skip the XML/metadata lines of the .tags files (they
                # start with '<'); real sentences do not.
                if len(line) > 0 and line[0] == '<':
                    continue
                # tokenize -> ids -> fixed-length padding -> strings
                word_ids = map(lambda x: str(x), padding(transform(parser(line), word_dict)))
                f_out.write(' '.join(word_ids) + '\n')
for file_vocab, word_dict in [(file_vocab_a, word_dict_a),
                              (file_vocab_b, word_dict_b)]:
    # Write each vocabulary: one word per line, ordered by id, so a word's
    # line number in the file equals its integer id.
    with open(file_vocab, 'w') as f_out:
        for word, _ in sorted(word_dict.items(), key=operator.itemgetter(1)):
            f_out.write(word + '\n')
模型
模型搭建及代码注释
import tensorflow as tf
import numpy as np
from flags import FLAGS
'''
Transformer modules
'''
'''
residual connection
'''
def add_and_norm(x, sub_x):
    """Residual connection followed by layer normalization.

    Computes LayerNorm(x + Dropout(sub_x)), where `sub_x` is the output of
    the sub-layer applied to `x` (self-attention or feed-forward).
    """
    with tf.variable_scope('add_and_norm'):
        # Dropout is applied to the sub-layer output before the residual add.
        dropped = tf.nn.dropout(sub_x, FLAGS.dropout_keep)
        return tf.contrib.layers.layer_norm(x + dropped)
def feed_forward(x, d_ff=2048):
    """Position-wise feed-forward network: FFN(x) = max(0, xW1 + b1)W2 + b2.

    The hidden layer has `d_ff` units with ReLU; the second dense layer
    projects back to the input's last dimension.
    """
    with tf.variable_scope('feed_forward'):
        hidden = tf.layers.dense(x, d_ff, activation=tf.nn.relu)
        return tf.layers.dense(hidden, x.get_shape()[-1])
'''
head_i=Attention(QW^Q,KW^K,VW^V)
'''
def multihead_attention_block(vk_input, q_input,
                              batch_size, pad_length, d_model, d_k, d_v, masked=False):
    """One attention head: Attention(QW^Q, KW^K, VW^V).

    K and V are projected from `vk_input`, Q from `q_input` (so the same
    function serves self-attention and encoder-decoder attention).  When
    `masked` is True a causal (lower-triangular) mask hides future positions,
    as required for decoder self-attention.

    NOTE(review): the dense projections use ReLU activations, which the
    original paper's linear projections do not — kept for compatibility with
    existing checkpoints/callers.
    """
    with tf.variable_scope('multihead_attention'):
        K = tf.layers.dense(vk_input, d_k, name='K', activation=tf.nn.relu)  # W^K
        V = tf.layers.dense(vk_input, d_v, name='V', activation=tf.nn.relu)  # W^V
        Q = tf.layers.dense(q_input, d_k, name='Q', activation=tf.nn.relu)   # W^Q
        # Scaled dot-product scores: (QK^T) / sqrt(d_k), shape
        # [batch, pad_length, pad_length].
        scores = (Q @ tf.transpose(K, [0, 2, 1])) / tf.sqrt(tf.to_float(d_k))
        if masked:
            # BUG FIX: the original MULTIPLIED the scores by a 0/1 triangular
            # mask.  That sets masked logits to 0, which after softmax still
            # receives weight exp(0) — future tokens leaked into the decoder.
            # The correct treatment is an additive mask: push masked logits to
            # a large negative value so softmax drives their weight to ~0.
            # (Also dropped the stray `tf.float32` that was being passed as
            # the operator's `is_non_singular` argument.)
            tril = tf.linalg.LinearOperatorLowerTriangular(
                tf.ones([pad_length, pad_length])).to_dense()
            tril = tf.reshape(tf.tile(tril, [batch_size, 1]),
                              [batch_size, pad_length, pad_length])
            scores += (1.0 - tril) * -1e9
        # Attention(Q,K,V) = softmax(scores) V
        attn = tf.nn.softmax(scores) @ V
        return attn
'''
MultiHead(Q,K,V)=Concat(head_1,...,head_h)W^O
'''
def multihead_attention(vk_input, q_input, masked=False):
    """MultiHead(Q,K,V) = Concat(head_1, ..., head_h) W^O.

    Runs FLAGS.h independent attention heads over the same inputs,
    concatenates them along the feature axis and projects back to d_model.
    """
    batch_size = tf.shape(vk_input)[0]
    heads = [multihead_attention_block(vk_input, q_input, batch_size,
                                       FLAGS.pad_length, FLAGS.d_model,
                                       FLAGS.d_k, FLAGS.d_v, masked=masked)
             for _ in range(FLAGS.h)]
    # Concat heads: [batch, pad_length, h * d_v] -> project with W^O.
    return tf.layers.dense(tf.concat(heads, axis=2), FLAGS.d_model)
'''
Transformer Encoder block
两个子层:① multi-head self-attention机制 ② 全连接前向网络
子层使用 residual connection,然后进行 layer normalization
'''
def encoder_block(inputs):
    """One Transformer encoder layer.

    Two sub-layers — multi-head self-attention and a position-wise
    feed-forward network — each wrapped in a residual connection followed by
    layer normalization.
    """
    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        attended = add_and_norm(inputs, multihead_attention(inputs, inputs))
        return add_and_norm(attended, feed_forward(attended))
'''
Transformer Decoder block
三个子层:① multi-head self-attention机制 ② encoder-decoder attention ③ 全连接前向网络
子层使用 residual connection,然后进行 layer normalization
'''
def decoder_block(outputs, encoder_outputs):
    """One Transformer decoder layer.

    Three sub-layers — masked multi-head self-attention, encoder-decoder
    attention (keys/values from the encoder output, queries from the decoder
    state), and a position-wise feed-forward network — each followed by a
    residual connection and layer normalization.
    """
    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        flow = add_and_norm(
            outputs, multihead_attention(outputs, outputs, masked=True))
        flow = add_and_norm(
            flow, multihead_attention(encoder_outputs, flow))
        return add_and_norm(flow, feed_forward(flow))
'''
Positional Encoding
PE(pos,2i)=sin[pos/10000^(2i/d_model)]
PE(pos,2i+1)=cos[pos/10000^(2i/d_model)]
其中pos->位置,i->维度
'''
def positional_encoding(x):
    """Add the sinusoidal positional encoding of the paper to `x`.

        PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
        PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))

    where `pos` is the position and `i` indexes the sin/cos dimension pairs.

    BUG FIX: the original used the exponent 2*i/d_model for EVERY dimension i
    (instead of sharing one exponent per sin/cos pair, i.e. 2*(i//2)/d_model)
    and started both pos and i at 1 — neither matches the formula it quotes.
    It also built the PE table as float64, which cannot be added to a float32
    input in TF; the table is now cast to `x`'s dtype.
    """
    pad_length = FLAGS.pad_length
    d_model = FLAGS.d_model
    with tf.variable_scope('positional_encoding'):
        pos = np.arange(pad_length)[:, np.newaxis]   # (pad_length, 1)
        dim = np.arange(d_model)[np.newaxis, :]      # (1, d_model)
        # Each sin/cos pair (2i, 2i+1) shares the exponent 2i/d_model.
        angles = pos / np.power(10000.0, (2 * (dim // 2)) / float(d_model))
        pe = np.where(dim % 2 == 0, np.sin(angles), np.cos(angles))
        # Leading batch axis of 1 broadcasts over the batch dimension of x.
        pe = tf.convert_to_tensor(pe[np.newaxis, ...], dtype=x.dtype)
        return tf.add(x, pe)
'''
Transformer class
'''
class Transformer(object):
    """Transformer encoder-decoder model (Vaswani et al., 2017), TF1 graph.

    Inputs may be supplied as pre-embedded tensors or left as None, in which
    case float placeholders of shape [None, pad_length, d_model] are created
    (used with the `use_pretrained_vec` feed-dict path in the trainer).
    """

    def __init__(self, inputs=None, outputs=None, sparse_outputs=None):
        """Store or create the three model input tensors.

        inputs:         embedded encoder input [batch, pad_length, d_model]
        outputs:        embedded decoder input [batch, pad_length, d_model]
        sparse_outputs: integer target token ids [batch, pad_length],
                        used for the loss and its sequence mask
        """
        pad_length = FLAGS.pad_length
        d_model = FLAGS.d_model
        # Encoder input: placeholder unless a tensor was provided.
        if inputs is None:
            self.inputs = tf.placeholder(tf.float32, shape=[None, pad_length, d_model])
        else:
            self.inputs = inputs
        # Decoder input: placeholder unless a tensor was provided.
        if outputs is None:
            self.outputs = tf.placeholder(tf.float32, shape=[None, pad_length, d_model])
        else:
            self.outputs = outputs
        # Sparse (id-valued) targets: placeholder unless a tensor was provided.
        if sparse_outputs is None:
            self.sparse_outputs = tf.placeholder(tf.int32, shape=[None, pad_length])
        else:
            self.sparse_outputs = sparse_outputs

    def build_graph(self, output_dim):
        """Build the forward pass, loss, optimizer and summary ops.

        output_dim is the target vocabulary size (logit width).  Side
        effects: sets self.logits, self.predict, self.debug, self.loss,
        self.optimize_op, self.summary_op.
        """
        # NOTE(review): indentation was lost in the source paste; the scope
        # nesting below is reconstructed — verify against the original repo.
        pad_length = FLAGS.pad_length
        N = FLAGS.stack_num
        learn_rate = FLAGS.learn_rate
        with tf.variable_scope('transformer'):
            # Add positional encodings to the (already embedded) encoder and
            # decoder inputs.
            inputs = positional_encoding(self.inputs)
            outputs = positional_encoding(self.outputs)
            # N identical encoder layers stacked in sequence.
            for i in range(N):
                with tf.variable_scope('enc_b_' + str(i)):
                    inputs = encoder_block(inputs)
            # N identical decoder layers; each attends over the final
            # encoder output.
            for i in range(N):
                with tf.variable_scope('dec_b_' + str(i)):
                    outputs = decoder_block(outputs, inputs)
            # Linear projection + argmax turn the decoder output into
            # next-token logits and greedy predictions.
            with tf.variable_scope('projection'):
                self.logits = tf.layers.dense(outputs, output_dim)
                self.predict = tf.argmax(self.logits, axis=2)
            # Loss with per-sequence masking and label smoothing.
            with tf.variable_scope('loss'):
                EOS_ID = 0
                # Count of non-<EOS> tokens per target sequence; the +1 keeps
                # the first <EOS> itself inside the loss mask.
                target_lengths = tf.reduce_sum(
                    tf.to_int32(tf.not_equal(self.sparse_outputs, EOS_ID)), 1) + 1
                seq_mask = tf.sequence_mask(lengths=target_lengths,
                                            maxlen=pad_length,
                                            dtype=tf.float32)
                y_ = tf.one_hot(self.sparse_outputs, depth=output_dim)
                self.debug = self.logits
                # Label smoothing with epsilon = 0.1.
                ys = y_.get_shape().as_list()[-1]
                y_ = ((1-0.1) * y_) + (0.1 / ys)
                # Cross-entropy per position, zeroed beyond the target length,
                # averaged per sequence and then over the batch.
                self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=y_)*seq_mask
                self.loss = tf.reduce_sum(self.loss, axis=1) / tf.to_float(target_lengths)
                self.loss = tf.reduce_mean(self.loss)
                tf.summary.scalar('loss', self.loss)
                # Adam with the paper's beta2=0.98 (no warmup schedule here).
                optimizer = tf.train.AdamOptimizer(learn_rate, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.optimize_op = optimizer.minimize(self.loss)
        # Merged summaries for TensorBoard.
        self.summary_op = tf.summary.merge_all()
模型训练
import tensorflow as tf
from flags import FLAGS
from model import Transformer
from reader import data, source_vocab, target_vocab
input_vocab_size = len(source_vocab)    # encoder-side vocabulary size
output_vocab_size = len(target_vocab)   # decoder-side vocabulary size
# Xavier-initialized embedding tables for source and target vocabularies.
initializer = tf.contrib.layers.xavier_initializer()
embedding_i = tf.get_variable('embedding_i', shape=[input_vocab_size,
                              FLAGS.d_model], initializer=initializer)
embedding_o = tf.get_variable('embedding_o', shape=[output_vocab_size,
                              FLAGS.d_model], initializer=initializer)
# Next (input, output) id batch from the dataset iterator, plus their
# embedded versions.
inputs_op, outputs_op = data.get_next()
embed_inputs_op = tf.nn.embedding_lookup(embedding_i, inputs_op)
embed_outputs_op = tf.nn.embedding_lookup(embedding_o, outputs_op)
# Build the Transformer: with placeholders (feed-dict path) when using
# pretrained vectors, otherwise wired directly to the embedding lookups.
if FLAGS.use_pretrained_vec == True:
    model = Transformer()
else:
    model = Transformer(inputs=embed_inputs_op, outputs=embed_outputs_op,
                        sparse_outputs=outputs_op)
model.build_graph(output_vocab_size)
# --- Training loop ---
# NOTE(review): indentation was lost in the source paste; block structure
# below is reconstructed from the control flow.
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), data.initializer])
    train_writer = tf.summary.FileWriter('./summary/train', sess.graph)
    step = 0
    feed_dict = {}
    while True:
        try:
            # Run one optimization step and fetch summary, loss and
            # predictions; the pretrained-vector path first materializes a
            # batch and feeds it through placeholders.
            if FLAGS.use_pretrained_vec == True:
                inputs, outputs, embed_inputs, embed_outputs = sess.run(
                    [inputs_op, outputs_op, embed_inputs_op, embed_outputs_op])
                feed_dict = {model.inputs: embed_inputs,
                             model.outputs: embed_outputs, model.sparse_outputs: outputs}
                _, summary, loss, predict = sess.run([model.optimize_op,
                    model.summary_op, model.loss, model.predict],
                    feed_dict=feed_dict)
            else:
                _, summary, loss, predict, inputs, outputs = sess.run([model.optimize_op,
                    model.summary_op, model.loss, model.predict, inputs_op, outputs_op],
                    feed_dict=feed_dict)
            # Every 77 steps: log to TensorBoard and print one sample pair
            # (reference vs. model output), decoded via the target vocab.
            if step % 77 == 0:
                train_writer.add_summary(summary, step)
                predict = predict.tolist()
                original = []
                result = []
                for p_i in predict[0]:
                    result.append(target_vocab[p_i])
                for p_i in outputs[0]:
                    original.append(target_vocab[p_i])
                # Truncate both sequences at the first <EOS>.
                if '<EOS>' in result:
                    result = result[:result.index('<EOS>')]
                if '<EOS>' in original:
                    original = original[:original.index('<EOS>')]
                original = ' '.join(original)
                result = ' '.join(result)
                print('step:'+str(step)+', loss: ' + str(loss))
                print(original)
                print(result)
                print('---')
            step += 1
        except tf.errors.OutOfRangeError:
            # Dataset iterator exhausted: training is finished.
            print('train done')
            break
结果
我跑了模型没有训练时的翻译结果和对应loss: