使用 TensorFlow 手动实现 seq2seq + attention（不调用现成的高层接口）

最新推荐文章于 2021-06-09 14:42:44 发布

jiang199912

最新推荐文章于 2021-06-09 14:42:44 发布

阅读量191

点赞数

分类专栏： nlp一些个人理解与实现文章标签： tensorflow python 深度学习自然语言处理神经网络

本文链接：https://blog.csdn.net/jiang199912/article/details/105418846

版权

nlp一些个人理解与实现专栏收录该内容

12 篇文章 2 订阅

订阅专栏

实现一个简单的seq2seq模型，基于我的上篇博客

数据处理

1.数据预处理

def _read_text(path):
    """Return the entire contents of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


# Raw corpora: one sequence per line, source and target kept in separate files.
source_data = _read_text('letters_source.txt')
target_data = _read_text('letters_target.txt')

def extract_character_vocab(data):
    """Build character<->id lookup tables for a newline-separated corpus.

    Id 0 is reserved for the empty string, which is used as the padding
    symbol. The remaining characters receive ids 1..N in sorted order, so
    the mapping is identical on every run (iterating a ``set`` of strings
    is not stable across interpreter runs).

    Args:
        data: Text whose lines are the individual sequences. The newline
            separator itself never becomes part of the vocabulary.

    Returns:
        A ``(int_to_vocab, vocab_to_int)`` tuple of dicts. For empty input
        both dicts contain only the padding entry.
    """
    characters = sorted({character for line in data.split('\n') for character in line})

    # Padding goes first so that id 0 never collides with a real character.
    int_to_vocab = {0: ''}
    int_to_vocab.update(enumerate(characters, start=1))
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}

    return int_to_vocab, vocab_to_int

# Build the id<->character lookup tables for each side of the corpus.
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

# Encode every line as the list of ids of its characters.
source_int = [list(map(source_letter_to_int.get, line))
              for line in source_data.split('\n')]
target_int = [list(map(target_letter_to_int.get, line))
              for line in target_data.split('\n')]
vocab_size = len(source_int_to_letter)

# Longest sequence on each side; shorter sequences are right-padded with
# id 0 (the padding symbol) up to this length.
max_length_source = max(len(seq) for seq in source_int)
max_length_target = max(len(seq) for seq in target_int)
sources = [seq + [0] * (max_length_source - len(seq)) for seq in source_int]
targets = [seq + [0] * (max_length_target - len(seq)) for seq in target_int]
print(sources[:10])
print(targets[:10])

在这里插入图片描述

2.获取batch

def get_batches(targets, sources, batch_size):
    """Yield consecutive full batches as ``(targets, sources)`` array pairs.

    The number of batches is ``len(sources) // batch_size``; trailing items
    that do not fill a whole batch are not yielded. Each yielded pair is
    ``(np.array(targets_slice), np.array(sources_slice)))`` taken over the
    same index range.
    """
    full_batches = len(sources) // batch_size
    for lower in range(0, full_batches * batch_size, batch_size):
        upper = lower + batch_size
        yield np.array(targets[lower:upper]), np.array(sources[lower:upper])

构建模型

1.参数设置以及输入输出设置

memory_size = 9  # width of the LSTM cell state and of each cell output
batch_size = 50
seq_input = 7  # input sequence length
seq_output = 7  # output sequence length
num_layers = 3
emb_size = 27  # width of the one-hot vector fed to the first encoder layer
learning_rates = 0.1  # value fed to the `learning_rate` placeholder
epochs = 100

learning_rate = tf.placeholder(tf.float32, name='learning_rate')
x = tf.placeholder(tf.int32, [batch_size, seq_input], name='inputs')
y = tf.placeholder(tf.int32, [batch_size, seq_output], name='targets')
print(x.shape)
# One-hot along axis 1 gives [batch_size, emb_size, seq_input]. The depth is
# taken from emb_size (instead of a second literal 27) because the encoder is
# told that its input width is emb_size.
x_emb = tf.one_hot(x, depth=emb_size, axis=1)
#y_emb = tf.contrib.layers.embed_sequence(ids=y,vocab_size=vocab_size,embed_dim=emb_size)
# Split along the time axis: a list of seq_input tensors, each
# [batch_size, emb_size].
x_emb = tf.unstack(x_emb, axis=2)
print(x_emb)

# Zero initial cell state and zero "previous output" for every encoder layer.
init_memory = tf.zeros([batch_size, memory_size])
pre_output = tf.zeros([batch_size, memory_size])

2.encoder层

# Encoder LSTM layer
def en_lstm_cell(layer_num, input_size, pre_output, memory, rnn_inputs):
    """Unroll one encoder LSTM layer over a sequence of per-step inputs.

    The weights live in variable scopes whose names end with ``layer_num``
    and are opened with ``reuse=tf.AUTO_REUSE``, so all time steps of one
    layer share the same parameters while different layers do not.

    Reads the module-level ``memory_size`` for the width of the gates.

    Args:
        layer_num: Index of the layer; converted to ``str`` and appended to
            the variable-scope names.
        input_size: Width of each element of ``rnn_inputs``.
        pre_output: Output fed to the first step as the "previous output";
            must be [batch_size, memory_size] for the concat/matmul below.
        memory: Initial cell state, [batch_size, memory_size].
        rnn_inputs: Iterable of per-step inputs, each
            [batch_size, input_size].

    Returns:
        ``(memory_out, pre_out)``: two lists with one entry per step, holding
        the cell state and the cell output after that step.
    """
    layer_num = str(layer_num)
    memory_out = []
    pre_out = []
    for rnn_input in rnn_inputs:
        # Every gate reads the same [input, previous output] concatenation,
        # so build it once per step instead of once per gate.
        gate_input = tf.concat([rnn_input, pre_output], 1)

        with tf.variable_scope('input_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wi = tf.get_variable('wi', shape=[input_size + memory_size, memory_size])
            bi = tf.get_variable('bi', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            wc = tf.get_variable('wc', shape=[input_size + memory_size, memory_size])
            ci = tf.get_variable('ci', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            it = tf.sigmoid(tf.matmul(gate_input, wi) + bi)  # input gate
            ic = tf.tanh(tf.matmul(gate_input, wc) + ci)  # candidate state

        with tf.variable_scope('forget_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wf = tf.get_variable('wf', shape=[input_size + memory_size, memory_size])
            bf = tf.get_variable('bf', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            ft = tf.sigmoid(tf.matmul(gate_input, wf) + bf)

        with tf.variable_scope('memory' + layer_num, reuse=tf.AUTO_REUSE):
            # Update the cell state: keep part of the old, add gated candidate.
            memory = memory * ft + ic * it

        with tf.variable_scope('output_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wo = tf.get_variable('wo', shape=[input_size + memory_size, memory_size])
            bo = tf.get_variable('bo', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            ot = tf.sigmoid(tf.matmul(gate_input, wo) + bo)
            # This step's output becomes the next step's "previous output".
            pre_output = ot * tf.tanh(memory)

        memory_out.append(memory)
        pre_out.append(pre_output)
    return memory_out, pre_out

3.decoder层

第一层 decoder 没有来自上一层的输入，因此不需要进行 concat 操作；之后的每一层都会把上一层 decoder 的输出作为输入，因此需要把它与 encoder 层最后一个时间步的输出进行 concat。

# Decoder layer
def de_lstm_cell(layer_num, input_size, num_en_cells, num_de_cells, pre_output, memory,  rnn_inputs=None, batch_size=batch_size, memory_size=memory_size):
    """Unroll one decoder layer that attends over the encoder cell states.

    For every decoder step a new trainable weight per encoder position is
    created (``tf.Variable``, uniformly initialised, not normalised); the
    weighted sum of the encoder states is the context for that step. The
    step output is ``sigmoid(gate_input @ wo + bo) * tanh(context)``.

    Layer 0 has no lower decoder layer, so its gate reads ``pre_output``
    alone and it runs for ``num_de_cells`` steps. Any other layer runs once
    per element of ``rnn_inputs`` and its gate reads
    ``concat([pre_output, rnn_input])``.

    Args:
        layer_num: Index of the layer; selects the branch described above
            and is appended to the variable-scope name.
        input_size: Width of each element of ``rnn_inputs`` (unused by
            layer 0).
        num_en_cells: Number of encoder steps, i.e. the length of ``memory``
            along its first axis.
        num_de_cells: Number of decoder steps produced by layer 0.
        pre_output: Tensor [batch_size, memory_size] fed to the output gate.
        memory: Encoder cell states, [num_en_cells, batch_size, memory_size]
            (a list of per-step tensors is accepted as well).
        rnn_inputs: Outputs of the previous decoder layer; required for
            every layer except layer 0.
        batch_size: Batch dimension used to tile the attention weights.
        memory_size: Width of the cell state / output.

    Returns:
        List of per-step output tensors, each [batch_size, memory_size].
    """
    pre_outputs = []
    layer_num = str(layer_num)
    if layer_num == '0':
        print('y')
        step_inputs = [None] * num_de_cells  # no lower decoder layer
        gate_width = memory_size
    else:
        print('n')
        step_inputs = rnn_inputs
        gate_width = input_size + memory_size

    # NOTE(review): pre_output is never updated inside the loop, so every
    # step gates on the same tensor; confirm this is intended.
    for rnn_input in step_inputs:
        attention_w = tf.Variable(tf.random_uniform(shape=(num_en_cells, 1, 1), dtype=tf.float32))
        attention_w = tf.tile(attention_w, [1, batch_size, memory_size])
        # Keep the context in its own name. Assigning it back to `memory`
        # would make every later step attend over the previous context
        # instead of over the encoder states.
        context = tf.reduce_sum(attention_w * memory, 0)

        if rnn_input is None:
            gate_input = pre_output
        else:
            gate_input = tf.concat([pre_output, rnn_input], 1)

        with tf.variable_scope('de_output_gate' + layer_num, reuse=tf.AUTO_REUSE):
            wo = tf.get_variable('wo', shape=[gate_width, memory_size])
            bo = tf.get_variable('bo', shape=[memory_size], initializer=tf.constant_initializer(0.0))
            ot = tf.sigmoid(tf.matmul(gate_input, wo) + bo)
            ht = ot * tf.tanh(context)
        pre_outputs.append(ht)
    return pre_outputs
# NOTE: The string literal below is never executed. It preserves an earlier,
# rejected design of the decoder cell (a full LSTM cell with input, forget
# and output gates that returns both the output and an updated cell state).
# The author's remark at the top of the literal says, in short: this version
# keeps changing the encoder's semantic vector, so the decoder's outputs feed
# back into that vector; since the vector also drives the next output, the
# semantic drift grows with sequence length, hence it is not usable.
"""
这个函数会让encoder层的语义向量继续变化，导致decoder层的输出影响语义向量，又因为本身又与下一步的输出有关系，序列较长时，语义偏差会较大
因此不可用
def de_lstm_cell(pre_output, memory):
    with tf.variable_scope('de_input_gate',reuse=tf.AUTO_REUSE):
        wi = tf.get_variable('wi', shape=[memory_size, memory_size])
        bi = tf.get_variable('bi', shape=[memory_size], initializer=tf.constant_initializer(0.0))
        wc = tf.get_variable('wc', shape=[memory_size, memory_size])
        ci = tf.get_variable('ci', shape=[memory_size], initializer=tf.constant_initializer(0.0))
        it = tf.sigmoid(tf.matmul(pre_output,wi)+bi) 
        ic = tf.tanh(tf.matmul(pre_output,wc)+ci)
    
    with tf.variable_scope('de_forget_gate',reuse=tf.AUTO_REUSE):
        wf = tf.get_variable('wf', shape=[memory_size, memory_size])
        bf = tf.get_variable('bf', shape=[memory_size], initializer=tf.constant_initializer(0.0))
        ft = tf.sigmoid(tf.matmul(pre_output,wf)+bf)
    
    with tf.variable_scope('de_memory',reuse=tf.AUTO_REUSE):
        c = memory*ft+ic*it
    
    with tf.variable_scope('de_output_gate',reuse=tf.AUTO_REUSE):
        wo = tf.get_variable('wo', shape=[memory_size, memory_size])
        bo = tf.get_variable('bo', shape=[memory_size], initializer=tf.constant_initializer(0.0))
        ot = tf.sigmoid(tf.matmul(pre_output,wo)+bo) 
        ht = ot*tf.tanh(c)
    return ht, c
"""

4.构建多层seq2seq

# num_layers: number of stacked encoder/decoder layers.
# The first encoder layer reads the one-hot inputs (width emb_size); every
# later layer reads the previous layer's outputs. The width is tracked in its
# own variable so the emb_size hyper-parameter is not overwritten.
layer_input_size = emb_size
for lay in range(num_layers):
    # x_emb is rebound to this layer's per-step outputs, which become the
    # inputs of the next encoder layer.
    memory_out, x_emb = en_lstm_cell(lay, layer_input_size, pre_output, init_memory, x_emb)
    final_preout = x_emb[-1]  # encoder output at the last time step
    layer_input_size = final_preout.shape[1]
    if lay == 0:
        pre_out_de = de_lstm_cell(lay, memory_size, seq_input, seq_output, final_preout, memory_out)
    else:
        # Higher decoder layers also consume the previous decoder layer's outputs.
        pre_out_de = de_lstm_cell(lay, memory_size, seq_input, seq_output, final_preout, memory_out, pre_out_de)

5.softmax层

简单的softmax层

# Project every decoder step output (width memory_size) onto the output
# classes. The bias is sized by the number of classes and is actually added
# to the projection; the projection is built once and shared by `predicts`
# and `logits`.
num_classes = 27
with tf.variable_scope('softmax'):
    W = tf.get_variable('W', [memory_size, num_classes])
    b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
step_logits = [tf.matmul(seq_out, W) + b for seq_out in pre_out_de]
# Stack the per-step logits along axis 1: [batch, step, num_classes].
batch_major_logits = tf.stack(step_logits, axis=1)
# Predicted class id per example and step.
predicts = tf.argmax(batch_major_logits, axis=2)
# One [step, num_classes] tensor per example, matching tf.unstack(y, axis=0).
logits = tf.unstack(batch_major_logits, axis=0)

6.损失计算以及优化器设置

# Split the label matrix into one row of label ids per example, so that it
# lines up with the per-example entries of `logits`.
y_emb = tf.unstack(y, axis=0)
per_example_losses = [
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=logit)
    for logit, label in zip(logits, y_emb)
]
# Mean cross-entropy over all examples and steps.
total_loss = tf.reduce_mean(per_example_losses)
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_step = optimizer.minimize(total_loss)

模型训练

# Train for `epochs` passes over the data. After each epoch the predictions
# of that epoch's last batch are appended to `predict`.
predict = []
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    training_losses = []
    for i in range(epochs):
        print("\nEPOCH", i)
        training_loss = 0
        steps_in_window = 0  # number of losses accumulated in training_loss
        predict_step = None  # stays None when the epoch yields no batch
        training_memory = np.zeros((batch_size, memory_size))

        for step, (X, Y) in enumerate(get_batches(sources, targets, batch_size)):
            # The per-example logits are not fetched: nothing here uses them.
            predict_step, training_loss_, _ = sess.run(
                [predicts, total_loss, train_step],
                feed_dict={x: X, y: Y, init_memory: training_memory, learning_rate: learning_rates})
            training_loss += training_loss_
            steps_in_window += 1
            if step % 100 == 0 and step > 0:
                # Divide by the number of losses actually summed; the first
                # window covers steps 0..100, i.e. 101 losses.
                average_loss = training_loss / steps_in_window
                print("Average loss at step", step,
                      "for last 100 steps:", average_loss)
                training_losses.append(average_loss)
                training_loss = 0
                steps_in_window = 0
        if predict_step is not None:
            predict.extend(predict_step)
    plt.plot(training_losses)
    plt.show()

# Decode the predictions back to text. The predicted ids come from a model
# trained against `targets`, so they index the target vocabulary; id 0 is the
# padding symbol and is dropped.
out = [[target_int_to_letter[i] for i in j if i != 0] for j in predict]
out = [''.join(i) for i in out]

加了attention的训练结果如下：
在这里插入图片描述

ps：这里实现的是一个 N vs M 的 seq2seq 模型，即所有输入序列的长度相同（N），所有输出序列的长度也相同（M），但 N 和 M 可以不相等。

pss：原本想实现一个 N 和 M 可以在不同 batch 之间变化的模型，结果在遍历 lstm 单元的地方出了问题。
原因：为了让 N 和 M 可以变化，我把输入数据的形状设置成了 [batch_size, None]，经过 emb 之后变成 [batch_size, None, emb_size]。由于第二维是 None，无法进行 unstack；改用 transpose 完成了维度转换之后，又发现因为该维度不确定，无法对它进行遍历，所以只能先完成这个简单的版本。希望下次能实现 N 和 M 可变的模型。