实现一个简单的seq2seq模型,基于我的上篇博客
数据处理
1.数据预处理
# Read both corpora fully into memory as text. The code further down splits
# each string on '\n', i.e. it treats every line as one sequence.
with open('letters_source.txt', mode='r', encoding='utf-8') as f:
    source_data = f.read()

with open('letters_target.txt', mode='r', encoding='utf-8') as f:
    target_data = f.read()
def extract_character_vocab(data):
    """Build character <-> integer lookup tables for a newline-separated corpus.

    Id 0 is reserved for the padding symbol ``''``. The distinct characters
    of ``data`` (newlines excluded) receive ids ``1..N`` in sorted order, so
    the mapping is reproducible across runs instead of depending on set
    iteration order.

    Args:
        data: The whole corpus as one string, one sequence per line.

    Returns:
        A tuple ``(int_to_vocab, vocab_to_int)`` of mutually inverse dicts.
        For empty input both contain only the padding entry.
    """
    # Splitting on '\n' and flattening the lines is the same as dropping
    # every newline character before collecting the distinct characters.
    characters = sorted(set(data.replace('\n', '')))

    int_to_vocab = {0: ''}
    int_to_vocab.update(enumerate(characters, start=1))
    vocab_to_int = {char: idx for idx, char in int_to_vocab.items()}
    return int_to_vocab, vocab_to_int
# Build the lookup tables for each side of the corpus.
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

# Encode every line as a list of character ids.
source_int = [
    [source_letter_to_int.get(letter) for letter in line]
    for line in source_data.split('\n')
]
target_int = [
    [target_letter_to_int.get(letter) for letter in line]
    for line in target_data.split('\n')
]
vocab_size = len(source_int_to_letter)

# Longest sequence on each side; every shorter sequence is padded up to it.
max_length_source = max((len(seq) for seq in source_int), default=0)
max_length_target = max((len(seq) for seq in target_int), default=0)

# Right-pad with id 0, which extract_character_vocab maps to ''.
sources = [seq + [0] * (max_length_source - len(seq)) for seq in source_int]
targets = [seq + [0] * (max_length_target - len(seq)) for seq in target_int]
print(sources[:10])
print(targets[:10])
2.获取batch
def get_batches(targets, sources, batch_size):
    """Yield consecutive full batches as pairs of NumPy arrays.

    Args:
        targets: Sequence of rows; becomes the FIRST element of each pair.
        sources: Sequence of rows; becomes the SECOND element of each pair.
            Its length alone decides how many batches are produced.
        batch_size: Number of rows per batch.

    Yields:
        ``(np.array(targets_batch), np.array(sources_batch))``. Rows left
        over after the last full batch are dropped.
    """
    full_batches = len(sources) // batch_size
    for index in range(full_batches):
        lo = index * batch_size
        hi = lo + batch_size
        yield np.array(targets[lo:hi]), np.array(sources[lo:hi])
构建模型
1.参数设置以及输入输出设置
# Hyperparameters.
memory_size = 9        # width of the LSTM cell state and hidden output
batch_size = 50
seq_input = 7          # input sequence length
seq_output = 7         # output sequence length
num_layers = 3
emb_size = 27          # width of each one-hot input vector
learning_rates = 0.1   # value fed into the `learning_rate` placeholder
epochs = 100

# Graph inputs. Batch size and sequence lengths are fixed in the placeholder
# shapes, so every fed batch must match them exactly.
learning_rate = tf.placeholder(tf.float32, name='learning_rate')
x = tf.placeholder(tf.int32, [batch_size, seq_input], name='inputs')
y = tf.placeholder(tf.int32, [batch_size, seq_output], name='targets')
print(x.shape)

# One-hot encode the input ids. With axis=1 the result has shape
# [batch_size, emb_size, seq_input]; unstacking axis 2 then yields a list of
# seq_input tensors of shape [batch_size, emb_size], one per time step.
# The depth is tied to emb_size (instead of a second literal 27) because
# emb_size is what the first encoder layer receives as its input width.
x_emb = tf.one_hot(x, depth=emb_size, axis=1)
#y_emb = tf.contrib.layers.embed_sequence(ids=y,vocab_size=vocab_size,embed_dim=emb_size)
x_emb = tf.unstack(x_emb, axis=2)
print(x_emb)

# Zero initial cell state and zero initial hidden output for the encoder.
init_memory = tf.zeros([batch_size, memory_size])
pre_output = tf.zeros([batch_size, memory_size])
2.encoder层
# LSTM layer for the encoder
def en_lstm_cell(layer_num, input_size, pre_output, memory, rnn_inputs):
    """Unroll one hand-written LSTM encoder layer over a sequence.

    Gate weights live in variable scopes suffixed with ``layer_num`` and are
    opened with ``tf.AUTO_REUSE``, so all time steps of one layer share a
    single set of weights. The state width is the module-level
    ``memory_size``.

    Args:
        layer_num: Index of the layer; only used to name the variable scopes.
        input_size: Width of each element of ``rnn_inputs``.
        pre_output: Initial hidden output, shape ``[batch, memory_size]``.
        memory: Initial cell state, shape ``[batch, memory_size]``.
        rnn_inputs: Iterable of per-step inputs, each of shape
            ``[batch, input_size]``.

    Returns:
        ``(memory_out, pre_out)``: two lists with one entry per time step,
        holding the cell state and the hidden output after that step.
    """
    layer_num = str(layer_num)
    gate_input_size = input_size + memory_size
    memory_out = []
    pre_out = []
    for rnn_input in rnn_inputs:
        # All four gates read the same [input, previous output] vector, so
        # build it once per step instead of once per gate.
        gate_input = tf.concat([rnn_input, pre_output], 1)

        with tf.variable_scope('input_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wi = tf.get_variable('wi', shape=[gate_input_size, memory_size])
            bi = tf.get_variable('bi', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            wc = tf.get_variable('wc', shape=[gate_input_size, memory_size])
            ci = tf.get_variable('ci', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            it = tf.sigmoid(tf.matmul(gate_input, wi) + bi)  # input gate
            ic = tf.tanh(tf.matmul(gate_input, wc) + ci)     # candidate state

        with tf.variable_scope('forget_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wf = tf.get_variable('wf', shape=[gate_input_size, memory_size])
            bf = tf.get_variable('bf', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            ft = tf.sigmoid(tf.matmul(gate_input, wf) + bf)

        with tf.variable_scope('memory' + layer_num, reuse=tf.AUTO_REUSE):
            # Update the cell state: keep part of the old state, add the
            # gated candidate.
            memory = memory * ft + ic * it

        with tf.variable_scope('output_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wo = tf.get_variable('wo', shape=[gate_input_size, memory_size])
            bo = tf.get_variable('bo', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            ot = tf.sigmoid(tf.matmul(gate_input, wo) + bo)
            # This step's hidden output becomes the next step's pre_output.
            pre_output = ot * tf.tanh(memory)

        memory_out.append(memory)
        pre_out.append(pre_output)
    return memory_out, pre_out
3.decoder层
decoder的第一层没有来自上一层的输入,因此不需要进行concat操作;之后的每一层会把前一层decoder的输出作为输入,并将其与encoder层最后一个时刻的输出进行concat。
# LSTM-style layer for the decoder
def de_lstm_cell(layer_num, input_size, num_en_cells, num_de_cells, pre_output, memory, rnn_inputs=None, batch_size=batch_size, memory_size=memory_size):
    """Unroll one decoder layer that attends over the encoder cell states.

    Every decoder step owns a fresh attention variable of shape
    ``(num_en_cells, 1, 1)``, initialised uniformly at random and used as-is
    (no normalisation is applied). The step's context is the weighted sum of
    the encoder cell states, and the step output is
    ``sigmoid(gate_input @ wo + bo) * tanh(context)``.

    Layer 0 has no lower decoder layer, so its gate reads ``pre_output``
    only; higher layers read ``pre_output`` concatenated with the matching
    step output of the lower decoder layer.

    Args:
        layer_num: Index of the layer; selects the layer-0 behaviour and
            names the variable scope.
        input_size: Width of each element of ``rnn_inputs`` (unused in
            layer 0).
        num_en_cells: Input sequence length, i.e. number of encoder steps.
        num_de_cells: Output sequence length; only layer 0 uses it, higher
            layers emit one step per element of ``rnn_inputs``.
        pre_output: Tensor fed to the output gate at every step, shape
            ``[batch_size, memory_size]``. It is not updated between steps.
        memory: Encoder cell states, ``[num_en_cells, batch_size,
            memory_size]`` (a list of per-step tensors also works).
        rnn_inputs: Per-step outputs of the lower decoder layer; required
            for every layer except layer 0.
        batch_size: Batch dimension used to tile the attention weights.
        memory_size: Width of the cell state / hidden output.

    Returns:
        A list with one ``[batch_size, memory_size]`` tensor per decoder
        step.
    """
    layer_num = str(layer_num)
    is_first_layer = layer_num == '0'
    print('y' if is_first_layer else 'n')

    if is_first_layer:
        step_inputs = [None] * num_de_cells
        gate_input_size = memory_size
    else:
        step_inputs = rnn_inputs
        gate_input_size = input_size + memory_size

    pre_outputs = []
    for rnn_input in step_inputs:
        attention_w = tf.Variable(tf.random_uniform(shape=(num_en_cells, 1, 1), dtype=tf.float32))
        attention_w = tf.tile(attention_w, [1, batch_size, memory_size])
        # Keep the encoder states in `memory` untouched. Rebinding `memory`
        # here would make every later step attend over the previous step's
        # context instead of over the encoder states.
        context = tf.reduce_sum(attention_w * memory, 0)

        with tf.variable_scope('de_output_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wo = tf.get_variable('wo', shape=[gate_input_size, memory_size])
            bo = tf.get_variable('bo', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            if is_first_layer:
                gate_input = pre_output
            else:
                gate_input = tf.concat([pre_output, rnn_input], 1)
            ot = tf.sigmoid(tf.matmul(gate_input, wo) + bo)
            ht = ot * tf.tanh(context)
        pre_outputs.append(ht)
    return pre_outputs
"""
这个版本的函数会在decoder的每一步继续更新encoder层得到的语义向量:decoder的输出会反过来影响语义向量,而语义向量又参与下一步输出的计算,序列较长时语义偏差会不断累积,
因此弃用,仅保留作为参考。
def de_lstm_cell(pre_output, memory):
with tf.variable_scope('de_input_gate',reuse=tf.AUTO_REUSE):
wi = tf.get_variable('wi', shape=[memory_size, memory_size])
bi = tf.get_variable('bi', shape=[memory_size], initializer=tf.constant_initializer(0.0))
wc = tf.get_variable('wc', shape=[memory_size, memory_size])
ci = tf.get_variable('ci', shape=[memory_size], initializer=tf.constant_initializer(0.0))
it = tf.sigmoid(tf.matmul(pre_output,wi)+bi)
ic = tf.tanh(tf.matmul(pre_output,wc)+ci)
with tf.variable_scope('de_forget_gate',reuse=tf.AUTO_REUSE):
wf = tf.get_variable('wf', shape=[memory_size, memory_size])
bf = tf.get_variable('bf', shape=[memory_size], initializer=tf.constant_initializer(0.0))
ft = tf.sigmoid(tf.matmul(pre_output,wf)+bf)
with tf.variable_scope('de_memory',reuse=tf.AUTO_REUSE):
c = memory*ft+ic*it
with tf.variable_scope('de_output_gate',reuse=tf.AUTO_REUSE):
wo = tf.get_variable('wo', shape=[memory_size, memory_size])
bo = tf.get_variable('bo', shape=[memory_size], initializer=tf.constant_initializer(0.0))
ot = tf.sigmoid(tf.matmul(pre_output,wo)+bo)
ht = ot*tf.tanh(c)
return ht, c
"""
4.构建多层seq2seq
# Stack `num_layers` encoder/decoder layer pairs.
for lay in range(num_layers):
    # Encoder layer: reads the previous layer's per-step outputs (the one-hot
    # inputs for layer 0) and always starts from the zero initial state.
    memory_out, x_emb = en_lstm_cell(lay, emb_size, pre_output, init_memory, x_emb)
    final_preout = x_emb[-1]
    # The next encoder layer's input width is this layer's output width.
    # NOTE(review): this overwrites the module-level emb_size hyperparameter.
    emb_size = final_preout.shape[1]

    # Layer 0 has no lower decoder layer; later layers also receive the
    # previous decoder layer's outputs as their rnn_inputs argument.
    lower_decoder_outputs = () if lay == 0 else (pre_out_de,)
    pre_out_de = de_lstm_cell(lay, memory_size, seq_input, seq_output,
                              final_preout, memory_out, *lower_decoder_outputs)
5.softmax层
简单的softmax层
# Output projection shared by all decoder time steps.
with tf.variable_scope('softmax'):
    # NOTE(review): 27 is the hard-coded number of output classes; it must be
    # larger than every target id -- confirm against the target vocabulary.
    W = tf.get_variable('W', [memory_size, 27])
    # The bias must have one entry per output class and must actually be
    # added; previously it was created with a mismatched size and never used.
    b = tf.get_variable('b', [27], initializer=tf.constant_initializer(0.0))
    # One [batch_size, 27] tensor per decoder step; built once and reused for
    # both the logits and the predictions.
    step_logits = [tf.matmul(seq_out, W) + b for seq_out in pre_out_de]

# Regroup from per-step to per-example: a list of batch_size tensors, each of
# shape [seq_output, 27].
logits = tf.unstack(step_logits, axis=1)
# Predicted class id for every example and step: [batch_size, seq_output].
predicts = tf.argmax(logits, 2)
6.损失计算以及优化器设置
# Split the target ids into one tensor per example so they can be paired
# element-wise with `logits` (assumed to hold one tensor per example as well).
y_emb = tf.unstack(y, axis=0)

# Cross-entropy per example and step, averaged over everything.
per_example_losses = [
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_ids, logits=example_logits)
    for example_logits, label_ids in zip(logits, y_emb)
]
total_loss = tf.reduce_mean(per_example_losses)

# Plain SGD; the step size is supplied through the `learning_rate` placeholder.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)
模型训练
# Predictions collected during training.
# NOTE(review): this list accumulates across all epochs, not just the last
# one -- confirm that is intended.
predict = []
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    training_losses = []
    for i in range(epochs):
        print("\nEPOCH", i)
        training_loss = 0
        training_memory = np.zeros((batch_size, memory_size))
        # get_batches takes (targets, sources) and yields batches in that
        # same order, so unpack as (Y, X).
        for step, (Y, X) in enumerate(get_batches(targets, sources, batch_size)):
            predict_step, training_loss_, _ = sess.run(
                [predicts, total_loss, train_step],
                feed_dict={x: X, y: Y, init_memory: training_memory,
                           learning_rate: learning_rates})
            training_loss += training_loss_
            # Report after every 100 completed steps so that each average
            # covers exactly 100 losses.
            if (step + 1) % 100 == 0:
                print("Average loss at step", step + 1,
                      "for last 100 steps:", training_loss/100)
                training_losses.append(training_loss/100)
                training_loss = 0
            predict.extend(predict_step)

plt.plot(training_losses)
plt.show()

# Decode the predictions. The network predicts TARGET ids, so they must be
# looked up in the target vocabulary; id 0 is padding and is skipped.
out = [[target_int_to_letter[char_id] for char_id in row if char_id != 0] for row in predict]
out = [''.join(chars) for chars in out]
加了attention的训练结果如下:
ps:实现的是一个N vs M的seq2seq模型:所有输入序列的长度相同(N),所有输出序列的长度也相同(M),但N和M可以不相等。
pss:原本想实现一个N和M在不同batch之间可以变化的模型,结果在遍历lstm单元时遇到了问题。
原因:为了让N和M可变,我把输入数据的形状设置为[batch_size, None],经过embedding后变成[batch_size, None, emb_size];由于第二维是None,无法进行unstack。改用transpose完成维度转换后,又因为该维度的长度不确定而无法遍历,所以最终只完成了这个固定长度的简单版本。希望下次能实现N和M可变的模型。