A simple seq2seq model implementation

Theory references: https://mp.weixin.qq.com/s/qXMRHxDDRa-_rJZMhXWB4w

http://blog.csdn.net/liuchonge/article/details/78824572

http://blog.csdn.net/liuchonge/article/details/78856692




Code reference: https://github.com/NELSONZHAO/zhihu/blob/master/basic_seq2seq/Seq2seq_char.ipynb


Note that the original author used TensorFlow 1.1 while I am on TensorFlow 1.3, so the return values differ slightly, mainly in the number of values returned by tf.contrib.seq2seq.dynamic_decode (see the sketch after the version check below). The data all comes from the blog posts above.


  
  
    In [1]: import tensorflow as tf
    In [2]: tf.__version__
    Out[2]: '1.3.0'
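
The difference is in tf.contrib.seq2seq.dynamic_decode: TensorFlow 1.1 returns two values (the decoder outputs and the final state), while 1.3 additionally returns the final sequence lengths, so the call has to unpack three values. Below is a minimal sketch with made-up toy sizes, not the model from this post, just to show the call signature:

    import tensorflow as tf
    from tensorflow.python.layers.core import Dense

    # Toy decoder, only to illustrate the dynamic_decode signature (all sizes invented)
    vocab_size, embed_size, rnn_size, batch = 10, 8, 16, 2
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size]))
    inputs = tf.nn.embedding_lookup(embeddings, tf.zeros([batch, 5], dtype=tf.int32))
    helper = tf.contrib.seq2seq.TrainingHelper(inputs, sequence_length=[5, 5])
    cell = tf.contrib.rnn.LSTMCell(rnn_size)
    decoder = tf.contrib.seq2seq.BasicDecoder(cell, helper,
                                              cell.zero_state(batch, tf.float32),
                                              Dense(vocab_size))

    # TF 1.1:  outputs, state = tf.contrib.seq2seq.dynamic_decode(decoder)
    # TF 1.3:  a third value, the final sequence lengths, is returned as well
    outputs, state, lengths = tf.contrib.seq2seq.dynamic_decode(decoder)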

First, let's look at data_utils.py:

   
   
    import numpy as np

    source_path = "data/letters_source.txt"
    target_path = "data/letters_target.txt"


    def read_file(source_path, target_path):
        with open(source_path, 'r', encoding="utf-8") as f:
            source_data = f.read()
        with open(target_path, 'r', encoding='utf-8') as f:
            target_data = f.read()
        return source_data, target_data


    def extract_character_vocab(data):
        """
        Build the character/id mapping tables.
        :param data:
        :return:
        """
        special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
        set_words = list(set([character for line in data.split('\n') for character in line]))
        int_to_vocab = {idx: word for idx, word in enumerate(special_words + set_words)}
        vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
        return int_to_vocab, vocab_to_int


    def file_to_ids(source_path="data/letters_source.txt", target_path="data/letters_target.txt"):
        source_data, target_data = read_file(source_path, target_path)
        source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
        target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)
        source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>'])
                       for letter in line] for line in source_data.split('\n')]
        target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>'])
                       for letter in line] + [target_letter_to_int['<EOS>']]
                      for line in target_data.split('\n')]
        return source_int, target_int, source_letter_to_int, target_letter_to_int


    def pad_sentence_batch(sentence_batch, pad_int):
        '''
        Pad the sequences in a batch so that every row has the same sequence_length.
        Arguments:
        - sentence_batch
        - pad_int: index of <PAD>
        '''
        max_sentence = max([len(sentence) for sentence in sentence_batch])
        return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]


    def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
        '''
        Generator that yields one batch at a time.
        '''
        for batch_i in range(0, len(sources) // batch_size):
            start_i = batch_i * batch_size
            sources_batch = sources[start_i:start_i + batch_size]
            targets_batch = targets[start_i:start_i + batch_size]
            # Pad the sequences
            pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
            pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
            # Record the (unpadded) length of every sequence
            targets_lengths = []
            for target in targets_batch:
                targets_lengths.append(len(target))
            source_lengths = []
            for source in sources_batch:
                source_lengths.append(len(source))
            yield pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths


    if __name__ == '__main__':
        source_data, target_data = read_file(source_path, target_path)
        print(source_data.split("\n")[0:10])
        print(target_data.split("\n")[0:10])
        # {0: '<PAD>', 1: '<UNK>', 2: '<GO>',
        #  3: '<EOS>', 4: 'v', 5: 'y', 6: 'i', 7: 'e', 8: 'd', 9:
        source_int, target_int, _, _ = file_to_ids()
        print(source_int[:10])
        print(target_int[0:10])
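
Before moving on, here is a quick sanity check of the two helpers that do the real work. This is just a sketch that assumes data_utils.py is importable from the current directory; the exact character ids depend on Python's set ordering, only the four special tokens always map to 0 through 3:

    from data_utils import extract_character_vocab, pad_sentence_batch

    # Build a vocabulary from two toy "lines", then pad a toy batch with <PAD> (always id 0)
    int_to_vocab, vocab_to_int = extract_character_vocab("abc\nba")
    batch = [[vocab_to_int[c] for c in word] for word in ["abc", "ba"]]
    print(pad_sentence_batch(batch, vocab_to_int['<PAD>']))
    # e.g. [[5, 4, 6], [4, 5, 0]] -- the shorter row gets a trailing 0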

config.py:

   
   
    class Config:
        batch_size = 128                # Batch size
        rnn_size = 50                   # Number of hidden units per RNN cell
        num_layers = 2                  # Number of stacked RNN layers
        encoding_embedding_size = 15    # Encoder embedding size
        decoding_embedding_size = 15    # Decoder embedding size
        learning_rate = 0.001           # Learning rate
        epochs = 100                    # Number of training epochs

model.py:

   
   
    import tensorflow as tf
    from tensorflow.python.layers.core import Dense


    class Seq2SeqModelBase(object):
        def __init__(self, config, source_letter_to_int, target_letter_to_int):
            self.config = config
            self.learning_rate = self.config.learning_rate
            self.batch_size = self.config.batch_size
            self.encoding_embedding_size = self.config.encoding_embedding_size
            self.decoding_embedding_size = self.config.decoding_embedding_size
            self.rnn_size = self.config.rnn_size
            self.num_layers = self.config.num_layers
            self.source_letter_to_int = source_letter_to_int
            self.target_letter_to_int = target_letter_to_int
            self.inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
            self.targets = tf.placeholder(tf.int32, [None, None], name='target')
            # Maximum length of the target sequences
            # (target_sequence_length and source_sequence_length
            #  are fed in later through feed_dict)
            self.target_sequence_length = tf.placeholder(tf.int32, shape=[None], name='target_sequence_length')
            self.max_target_sequence_length = tf.reduce_max(self.target_sequence_length, name='max_target_len')
            self.source_sequence_length = tf.placeholder(tf.int32, shape=[None], name='source_sequence_length')
            self.train()

        # Encoder layer
        def get_encoder_layer(self, input_data, rnn_size, num_layers,
                              source_sequence_length, source_vocab_size,
                              encoding_embedding_size):
            '''
            Build the encoder.
            Arguments:
            - input_data: input tensor
            - rnn_size: number of hidden units per RNN cell
            - num_layers: number of stacked RNN cells
            - source_sequence_length: lengths of the source sequences
            - source_vocab_size: size of the source vocabulary
            - encoding_embedding_size: embedding size
            '''
            # Encoder embedding
            encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size,
                                                                   encoding_embedding_size)

            # RNN cell
            def get_lstm_cell(rnn_size):
                lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
                return lstm_cell

            cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
            encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input,
                                                              sequence_length=source_sequence_length,
                                                              dtype=tf.float32)
            return encoder_output, encoder_state

        # Decoder side: preprocess the target data
        def process_decoder_input(self, data, vocab_to_int, batch_size):
            '''
            Prepend <GO> and drop the last character.
            '''
            # Cut off the last character
            ending = tf.strided_slice(data, [0, 0], [batch_size, -1], [1, 1])
            decoder_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
            return decoder_input

        """
        Embedding:
        the target data also has to be embedded so that it can be fed into the decoder RNN.
        """
        def decoding_layer(self, target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                           target_sequence_length, max_target_sequence_length, encoder_state, decoder_input):
            '''
            Build the decoder.
            Arguments:
            - target_letter_to_int: mapping table of the target data
            - decoding_embedding_size: embedding vector size
            - num_layers: number of stacked RNN cells
            - rnn_size: number of hidden units per RNN cell
            - target_sequence_length: lengths of the target sequences
            - max_target_sequence_length: maximum target sequence length
            - encoder_state: state vector produced by the encoder
            - decoder_input: decoder input
            '''
            # 1. Embedding
            target_vocab_size = len(target_letter_to_int)
            decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
            decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

            # 2. RNN cell for the decoder
            def get_decoder_cell(rnn_size):
                decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                                       initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
                return decoder_cell

            cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])

            # 3. Fully connected output layer
            output_layer = Dense(target_vocab_size,
                                 kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

            # 4. Training decoder
            with tf.variable_scope("decode"):
                # Helper object
                training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                                    sequence_length=target_sequence_length)
                # Build the decoder
                training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                                   training_helper,
                                                                   encoder_state,
                                                                   output_layer)
                # training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder)
                training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    training_decoder,
                    impute_finished=True,
                    maximum_iterations=max_target_sequence_length)

            # 5. Predicting decoder
            # Shares parameters with the training decoder
            with tf.variable_scope("decode", reuse=True):
                # Create a constant tensor and tile it to batch_size
                start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32),
                                       [self.batch_size], name='start_tokens')
                predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                             start_tokens,
                                                                             target_letter_to_int['<EOS>'])
                predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                                     predicting_helper,
                                                                     encoder_state,
                                                                     output_layer)
                predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    predicting_decoder,
                    impute_finished=True,
                    maximum_iterations=max_target_sequence_length)
            return training_decoder_output, predicting_decoder_output

        def _get_simple_lstm(self, rnn_size, layer_size):
            lstm_layers = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(layer_size)]
            return tf.contrib.rnn.MultiRNNCell(lstm_layers)

        """
        Seq2Seq:
        the encoder and decoder are built above; here the two parts are wired together into the seq2seq model.
        """
        def seq2seq_model(self, input_data, targets, lr, target_sequence_length,
                          max_target_sequence_length, source_sequence_length,
                          source_vocab_size, target_vocab_size,
                          encoder_embedding_size, decoder_embedding_size,
                          rnn_size, num_layers):
            # Get the encoder state
            _, encoder_state = self.get_encoder_layer(input_data,
                                                      rnn_size,
                                                      num_layers,
                                                      source_sequence_length,
                                                      source_vocab_size,
                                                      encoder_embedding_size)
            # Preprocessed decoder input
            decoder_input = self.process_decoder_input(targets, self.target_letter_to_int, self.batch_size)
            # Pass the state vector and the input to the decoder
            training_decoder_output, predicting_decoder_output = self.decoding_layer(
                self.target_letter_to_int,
                decoder_embedding_size,
                num_layers,
                rnn_size,
                target_sequence_length,
                max_target_sequence_length,
                encoder_state,
                decoder_input)
            return training_decoder_output, predicting_decoder_output

        def train(self):
            training_decoder_output, predicting_decoder_output = self.seq2seq_model(
                self.inputs,
                self.targets,
                self.learning_rate,
                self.target_sequence_length,
                self.max_target_sequence_length,
                self.source_sequence_length,
                len(self.source_letter_to_int),
                len(self.target_letter_to_int),
                self.encoding_embedding_size,
                self.decoding_embedding_size,
                self.rnn_size,
                self.num_layers)
            training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
            predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
            # Mask so that <PAD> positions do not contribute to the loss
            masks = tf.sequence_mask(self.target_sequence_length, self.max_target_sequence_length,
                                     dtype=tf.float32, name='masks')
            print('training_logits: {}'.format(training_logits.get_shape()))
            print('self.targets: {}'.format(self.targets.get_shape()))
            self.cost = tf.contrib.seq2seq.sequence_loss(
                training_logits,
                self.targets,
                masks)
            # Alternative loss (flattened logits + sparse softmax cross entropy):
            # targets = tf.reshape(self.targets, [-1])
            # logits_flat = tf.reshape(training_decoder_output.rnn_output, [-1, len(self.target_letter_to_int)])
            # print('shape logits_flat: {}'.format(logits_flat.shape))
            # print('shape logits: {}'.format(training_decoder_output.rnn_output.shape))
            # self.cost = tf.losses.sparse_softmax_cross_entropy(labels=targets, logits=logits_flat)

            # Optimizer
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            # Gradient clipping
            gradients = optimizer.compute_gradients(self.cost)
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gradients)
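
Two details in model.py are easy to miss: process_decoder_input builds the decoder input by cutting off the last target column and prepending <GO>, and sequence_loss only counts the positions allowed by tf.sequence_mask, so padded positions do not affect the loss. Here is a small standalone sketch with made-up ids (3 standing in for <GO>, 9 for <EOS>, 0 for <PAD>):

    import tensorflow as tf

    # Toy batch of target ids, already padded to the batch maximum
    targets = tf.constant([[7, 8, 9],
                           [5, 9, 0]], dtype=tf.int32)
    target_lengths = tf.constant([3, 2])
    batch_size, go_id = 2, 3

    # Same trick as process_decoder_input: drop the last column, prepend <GO>
    ending = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], go_id), ending], 1)

    # Mask used by sequence_loss so that <PAD> positions contribute nothing
    masks = tf.sequence_mask(target_lengths, tf.reduce_max(target_lengths), dtype=tf.float32)

    with tf.Session() as sess:
        print(sess.run(decoder_input))  # [[3 7 8] [3 5 9]]
        print(sess.run(masks))          # [[1. 1. 1.] [1. 1. 0.]]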


run.py:

   
   
    import tensorflow as tf
    from data_utils import file_to_ids, get_batches
    from config import Config
    from model import Seq2SeqModelBase

    config = Config()
    batch_size = config.batch_size
    source_int, target_int, source_letter_to_int, target_letter_to_int = file_to_ids()
    model = Seq2SeqModelBase(config, source_letter_to_int, target_letter_to_int)

    # Split the data set into train and validation
    train_source = source_int[batch_size:]
    train_target = target_int[batch_size:]
    # Keep one batch aside for validation
    valid_source = source_int[:batch_size]
    valid_target = target_int[:batch_size]
    (valid_targets_batch, valid_sources_batch,
     valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                                                                      source_letter_to_int['<PAD>'],
                                                                      target_letter_to_int['<PAD>']))

    display_step = 50  # print the loss every 50 batches
    checkpoint = "modelsave/trained_model.ckpt"

    # Build the graph and train
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch_i in range(1, config.epochs + 1):
            for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                    get_batches(train_target, train_source, batch_size,
                                source_letter_to_int['<PAD>'],
                                target_letter_to_int['<PAD>'])):
                _, loss = sess.run(
                    [model.train_op, model.cost],
                    feed_dict={model.inputs: sources_batch,
                               model.targets: targets_batch,
                               model.target_sequence_length: targets_lengths,
                               model.source_sequence_length: sources_lengths})
                if batch_i % display_step == 0 and batch_i > 0:
                    # Compute the validation loss
                    validation_loss = sess.run(
                        [model.cost],
                        feed_dict={model.inputs: valid_sources_batch,
                                   model.targets: valid_targets_batch,
                                   model.target_sequence_length: valid_targets_lengths,
                                   model.source_sequence_length: valid_sources_lengths})
                    print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f} '
                          '- Validation loss: {:>6.3f}'.format(epoch_i, config.epochs, batch_i,
                                                               len(train_source) // batch_size,
                                                               loss, validation_loss[0]))

        # Save the model
        saver = tf.train.Saver()
        saver.save(sess, checkpoint)
        print('Model Trained and Saved')

Run output:

   
   
    2017-12-26 17:06:15.971564: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
    2017-12-26 17:06:15.971732: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
    2017-12-26 17:06:15.971741: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
    2017-12-26 17:06:15.971747: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
    Epoch   1/100 Batch   50/77 - Training Loss:  2.885 - Validation loss:  2.884
    Epoch   2/100 Batch   50/77 - Training Loss:  2.137 - Validation loss:  2.146
    Epoch   3/100 Batch   50/77 - Training Loss:  1.783 - Validation loss:  1.801
    Epoch   4/100 Batch   50/77 - Training Loss:  1.528 - Validation loss:  1.537
    Epoch   5/100 Batch   50/77 - Training Loss:  1.320 - Validation loss:  1.307
    Epoch   6/100 Batch   50/77 - Training Loss:  1.123 - Validation loss:  1.111
    Epoch   7/100 Batch   50/77 - Training Loss:  0.961 - Validation loss:  0.952
    Epoch   8/100 Batch   50/77 - Training Loss:  0.827 - Validation loss:  0.826
    Epoch   9/100 Batch   50/77 - Training Loss:  0.726 - Validation loss:  0.733
    Epoch  10/100 Batch   50/77 - Training Loss:  0.643 - Validation loss:  0.642
    Epoch  11/100 Batch   50/77 - Training Loss:  0.576 - Validation loss:  0.563
    Epoch  12/100 Batch   50/77 - Training Loss:  0.518 - Validation loss:  0.492
    Epoch  13/100 Batch   50/77 - Training Loss:  0.460 - Validation loss:  0.435
    Epoch  14/100 Batch   50/77 - Training Loss:  0.401 - Validation loss:  0.382
    Epoch  15/100 Batch   50/77 - Training Loss:  0.349 - Validation loss:  0.330
    Epoch  16/100 Batch   50/77 - Training Loss:  0.303 - Validation loss:  0.282
    Epoch  17/100 Batch   50/77 - Training Loss:  0.261 - Validation loss:  0.240
    Epoch  18/100 Batch   50/77 - Training Loss:  0.226 - Validation loss:  0.203




Prediction:

   
   
    from data_utils import extract_character_vocab, read_file
    import tensorflow as tf

    batch_size = 128
    source_path = "data/letters_source.txt"
    target_path = "data/letters_target.txt"
    source_data, target_data = read_file(source_path, target_path)
    target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)
    source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)


    def source_to_seq(text):
        '''
        Convert the source input to ids and pad it to a fixed length.
        '''
        sequence_length = 7
        return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text] \
               + [source_letter_to_int['<PAD>']] * (sequence_length - len(text))


    input_word = 'xxegcbj'
    text = source_to_seq(input_word)
    checkpoint = "modelsave/trained_model.ckpt"

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
        # Load the saved model
        loader = tf.train.import_meta_graph(checkpoint + '.meta')
        loader.restore(sess, checkpoint)
        input_data = loaded_graph.get_tensor_by_name('inputs:0')
        logits = loaded_graph.get_tensor_by_name('predictions:0')
        source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
        target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
        # The graph was built with a fixed batch_size, so the single example is tiled batch_size times
        answer_logits = sess.run(logits, {input_data: [text] * batch_size,
                                          target_sequence_length: [len(input_word)] * batch_size,
                                          source_sequence_length: [len(input_word)] * batch_size})[0]
        print("answer_logits is:", answer_logits)

    pad = source_letter_to_int["<PAD>"]
    print('Original input:', input_word)
    print('\nSource')
    print('  Word ids:    {}'.format([i for i in text]))
    print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))
    print('\nTarget')
    print('  Word ids:       {}'.format([i for i in answer_logits if i != pad]))
    print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))

