A simple seq2seq model implementation

Theory references: https://mp.weixin.qq.com/s/qXMRHxDDRa-_rJZMhXWB4w

http://blog.csdn.net/liuchonge/article/details/78824572

http://blog.csdn.net/liuchonge/article/details/78856692




Code reference: https://github.com/NELSONZHAO/zhihu/blob/master/basic_seq2seq/Seq2seq_char.ipynb


Note that the original author used TensorFlow 1.1 while I am on TensorFlow 1.3, so the return values differ slightly, mainly in the number of values returned by tf.contrib.seq2seq.dynamic_decode (see the sketch after the version check below). The data all comes from the blog posts above.


  
  
    In [1]: import tensorflow as tf
    In [2]: tf.__version__
    Out[2]: '1.3.0'
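
The difference is in tf.contrib.seq2seq.dynamic_decode: TensorFlow 1.1 returns two values (the decoder outputs and the final state), while 1.3 additionally returns the final sequence lengths, so the call has to unpack three values. Below is a minimal sketch with made-up toy sizes, not the model from this post, just to show the call signature:

    import tensorflow as tf
    from tensorflow.python.layers.core import Dense

    # Toy decoder, only to illustrate the dynamic_decode signature (all sizes invented)
    vocab_size, embed_size, rnn_size, batch = 10, 8, 16, 2
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embed_size]))
    inputs = tf.nn.embedding_lookup(embeddings, tf.zeros([batch, 5], dtype=tf.int32))
    helper = tf.contrib.seq2seq.TrainingHelper(inputs, sequence_length=[5, 5])
    cell = tf.contrib.rnn.LSTMCell(rnn_size)
    decoder = tf.contrib.seq2seq.BasicDecoder(cell, helper,
                                              cell.zero_state(batch, tf.float32),
                                              Dense(vocab_size))

    # TF 1.1:  outputs, state = tf.contrib.seq2seq.dynamic_decode(decoder)
    # TF 1.3:  a third value, the final sequence lengths, is returned as well
    outputs, state, lengths = tf.contrib.seq2seq.dynamic_decode(decoder)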

First, let's look at data_utils.py:

   
   
    import numpy as np

    source_path = "data/letters_source.txt"
    target_path = "data/letters_target.txt"


    def read_file(source_path, target_path):
        with open(source_path, 'r', encoding="utf-8") as f:
            source_data = f.read()
        with open(target_path, 'r', encoding='utf-8') as f:
            target_data = f.read()
        return source_data, target_data


    def extract_character_vocab(data):
        """
        Build the character/id mapping tables.
        :param data:
        :return:
        """
        special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
        set_words = list(set([character for line in data.split('\n') for character in line]))
        int_to_vocab = {idx: word for idx, word in enumerate(special_words + set_words)}
        vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
        return int_to_vocab, vocab_to_int


    def file_to_ids(source_path="data/letters_source.txt", target_path="data/letters_target.txt"):
        source_data, target_data = read_file(source_path, target_path)
        source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
        target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)
        source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>'])
                       for letter in line] for line in source_data.split('\n')]
        target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>'])
                       for letter in line] + [target_letter_to_int['<EOS>']]
                      for line in target_data.split('\n')]
        return source_int, target_int, source_letter_to_int, target_letter_to_int


    def pad_sentence_batch(sentence_batch, pad_int):
        '''
        Pad the sequences in a batch so that every row has the same sequence_length.
        Arguments:
        - sentence_batch
        - pad_int: index of <PAD>
        '''
        max_sentence = max([len(sentence) for sentence in sentence_batch])
        return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]


    def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
        '''
        Generator that yields one batch at a time.
        '''
        for batch_i in range(0, len(sources) // batch_size):
            start_i = batch_i * batch_size
            sources_batch = sources[start_i:start_i + batch_size]
            targets_batch = targets[start_i:start_i + batch_size]
            # Pad the sequences
            pad_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
            pad_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
            # Record the (unpadded) length of every sequence
            targets_lengths = []
            for target in targets_batch:
                targets_lengths.append(len(target))
            source_lengths = []
            for source in sources_batch:
                source_lengths.append(len(source))
            yield pad_targets_batch, pad_sources_batch, targets_lengths, source_lengths


    if __name__ == '__main__':
        source_data, target_data = read_file(source_path, target_path)
        print(source_data.split("\n")[0:10])
        print(target_data.split("\n")[0:10])
        # {0: '<PAD>', 1: '<UNK>', 2: '<GO>',
        #  3: '<EOS>', 4: 'v', 5: 'y', 6: 'i', 7: 'e', 8: 'd', 9:
        source_int, target_int, _, _ = file_to_ids()
        print(source_int[:10])
        print(target_int[0:10])
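
Before moving on, here is a quick sanity check of the two helpers that do the real work. This is just a sketch that assumes data_utils.py is importable from the current directory; the exact character ids depend on Python's set ordering, only the four special tokens always map to 0 through 3:

    from data_utils import extract_character_vocab, pad_sentence_batch

    # Build a vocabulary from two toy "lines", then pad a toy batch with <PAD> (always id 0)
    int_to_vocab, vocab_to_int = extract_character_vocab("abc\nba")
    batch = [[vocab_to_int[c] for c in word] for word in ["abc", "ba"]]
    print(pad_sentence_batch(batch, vocab_to_int['<PAD>']))
    # e.g. [[5, 4, 6], [4, 5, 0]] -- the shorter row gets a trailing 0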

config.py:

   
   
    class Config:
        batch_size = 128                # Batch size
        rnn_size = 50                   # Number of hidden units per RNN cell
        num_layers = 2                  # Number of stacked RNN layers
        encoding_embedding_size = 15    # Encoder embedding size
        decoding_embedding_size = 15    # Decoder embedding size
        learning_rate = 0.001           # Learning rate
        epochs = 100                    # Number of training epochs

model.py:

   
   
    import tensorflow as tf
    from tensorflow.python.layers.core import Dense


    class Seq2SeqModelBase(object):
        def __init__(self, config, source_letter_to_int, target_letter_to_int):
            self.config = config
            self.learning_rate = self.config.learning_rate
            self.batch_size = self.config.batch_size
            self.encoding_embedding_size = self.config.encoding_embedding_size
            self.decoding_embedding_size = self.config.decoding_embedding_size
            self.rnn_size = self.config.rnn_size
            self.num_layers = self.config.num_layers
            self.source_letter_to_int = source_letter_to_int
            self.target_letter_to_int = target_letter_to_int
            self.inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
            self.targets = tf.placeholder(tf.int32, [None, None], name='target')
            # Maximum length of the target sequences
            # (target_sequence_length and source_sequence_length
            #  are fed in later through feed_dict)
            self.target_sequence_length = tf.placeholder(tf.int32, shape=[None], name='target_sequence_length')
            self.max_target_sequence_length = tf.reduce_max(self.target_sequence_length, name='max_target_len')
            self.source_sequence_length = tf.placeholder(tf.int32, shape=[None], name='source_sequence_length')
            self.train()

        # Encoder layer
        def get_encoder_layer(self, input_data, rnn_size, num_layers,
                              source_sequence_length, source_vocab_size,
                              encoding_embedding_size):
            '''
            Build the encoder.
            Arguments:
            - input_data: input tensor
            - rnn_size: number of hidden units per RNN cell
            - num_layers: number of stacked RNN cells
            - source_sequence_length: lengths of the source sequences
            - source_vocab_size: size of the source vocabulary
            - encoding_embedding_size: embedding size
            '''
            # Encoder embedding
            encoder_embed_input = tf.contrib.layers.embed_sequence(input_data, source_vocab_size,
                                                                   encoding_embedding_size)

            # RNN cell
            def get_lstm_cell(rnn_size):
                lstm_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
                return lstm_cell

            cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(rnn_size) for _ in range(num_layers)])
            encoder_output, encoder_state = tf.nn.dynamic_rnn(cell, encoder_embed_input,
                                                              sequence_length=source_sequence_length,
                                                              dtype=tf.float32)
            return encoder_output, encoder_state

        # Decoder side: preprocess the target data
        def process_decoder_input(self, data, vocab_to_int, batch_size):
            '''
            Prepend <GO> and drop the last character.
            '''
            # Cut off the last character
            ending = tf.strided_slice(data, [0, 0], [batch_size, -1], [1, 1])
            decoder_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
            return decoder_input

        """
        Embedding:
        the target data also has to be embedded so that it can be fed into the decoder RNN.
        """
        def decoding_layer(self, target_letter_to_int, decoding_embedding_size, num_layers, rnn_size,
                           target_sequence_length, max_target_sequence_length, encoder_state, decoder_input):
            '''
            Build the decoder.
            Arguments:
            - target_letter_to_int: mapping table of the target data
            - decoding_embedding_size: embedding vector size
            - num_layers: number of stacked RNN cells
            - rnn_size: number of hidden units per RNN cell
            - target_sequence_length: lengths of the target sequences
            - max_target_sequence_length: maximum target sequence length
            - encoder_state: state vector produced by the encoder
            - decoder_input: decoder input
            '''
            # 1. Embedding
            target_vocab_size = len(target_letter_to_int)
            decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
            decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)

            # 2. RNN cell for the decoder
            def get_decoder_cell(rnn_size):
                decoder_cell = tf.contrib.rnn.LSTMCell(rnn_size,
                                                       initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
                return decoder_cell

            cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(rnn_size) for _ in range(num_layers)])

            # 3. Fully connected output layer
            output_layer = Dense(target_vocab_size,
                                 kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

            # 4. Training decoder
            with tf.variable_scope("decode"):
                # Helper object
                training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                                    sequence_length=target_sequence_length)
                # Build the decoder
                training_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                                   training_helper,
                                                                   encoder_state,
                                                                   output_layer)
                # training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder)
                training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    training_decoder,
                    impute_finished=True,
                    maximum_iterations=max_target_sequence_length)

            # 5. Predicting decoder
            # Shares parameters with the training decoder
            with tf.variable_scope("decode", reuse=True):
                # Create a constant tensor and tile it to batch_size
                start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32),
                                       [self.batch_size], name='start_tokens')
                predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                             start_tokens,
                                                                             target_letter_to_int['<EOS>'])
                predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                                     predicting_helper,
                                                                     encoder_state,
                                                                     output_layer)
                predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    predicting_decoder,
                    impute_finished=True,
                    maximum_iterations=max_target_sequence_length)
            return training_decoder_output, predicting_decoder_output

        def _get_simple_lstm(self, rnn_size, layer_size):
            lstm_layers = [tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(layer_size)]
            return tf.contrib.rnn.MultiRNNCell(lstm_layers)

        """
        Seq2Seq:
        the encoder and decoder are built above; here the two parts are wired together into the seq2seq model.
        """
        def seq2seq_model(self, input_data, targets, lr, target_sequence_length,
                          max_target_sequence_length, source_sequence_length,
                          source_vocab_size, target_vocab_size,
                          encoder_embedding_size, decoder_embedding_size,
                          rnn_size, num_layers):
            # Get the encoder state
            _, encoder_state = self.get_encoder_layer(input_data,
                                                      rnn_size,
                                                      num_layers,
                                                      source_sequence_length,
                                                      source_vocab_size,
                                                      encoder_embedding_size)
            # Preprocessed decoder input
            decoder_input = self.process_decoder_input(targets, self.target_letter_to_int, self.batch_size)
            # Pass the state vector and the input to the decoder
            training_decoder_output, predicting_decoder_output = self.decoding_layer(
                self.target_letter_to_int,
                decoder_embedding_size,
                num_layers,
                rnn_size,
                target_sequence_length,
                max_target_sequence_length,
                encoder_state,
                decoder_input)
            return training_decoder_output, predicting_decoder_output

        def train(self):
            training_decoder_output, predicting_decoder_output = self.seq2seq_model(
                self.inputs,
                self.targets,
                self.learning_rate,
                self.target_sequence_length,
                self.max_target_sequence_length,
                self.source_sequence_length,
                len(self.source_letter_to_int),
                len(self.target_letter_to_int),
                self.encoding_embedding_size,
                self.decoding_embedding_size,
                self.rnn_size,
                self.num_layers)
            training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
            predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
            # Mask so that <PAD> positions do not contribute to the loss
            masks = tf.sequence_mask(self.target_sequence_length, self.max_target_sequence_length,
                                     dtype=tf.float32, name='masks')
            print('training_logits: {}'.format(training_logits.get_shape()))
            print('self.targets: {}'.format(self.targets.get_shape()))
            self.cost = tf.contrib.seq2seq.sequence_loss(
                training_logits,
                self.targets,
                masks)
            # Alternative loss (flattened logits + sparse softmax cross entropy):
            # targets = tf.reshape(self.targets, [-1])
            # logits_flat = tf.reshape(training_decoder_output.rnn_output, [-1, len(self.target_letter_to_int)])
            # print('shape logits_flat: {}'.format(logits_flat.shape))
            # print('shape logits: {}'.format(training_decoder_output.rnn_output.shape))
            # self.cost = tf.losses.sparse_softmax_cross_entropy(labels=targets, logits=logits_flat)

            # Optimizer
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            # Gradient clipping
            gradients = optimizer.compute_gradients(self.cost)
            capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gradients)
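
Two details in model.py are easy to miss: process_decoder_input builds the decoder input by cutting off the last target column and prepending <GO>, and sequence_loss only counts the positions allowed by tf.sequence_mask, so padded positions do not affect the loss. Here is a small standalone sketch with made-up ids (3 standing in for <GO>, 9 for <EOS>, 0 for <PAD>):

    import tensorflow as tf

    # Toy batch of target ids, already padded to the batch maximum
    targets = tf.constant([[7, 8, 9],
                           [5, 9, 0]], dtype=tf.int32)
    target_lengths = tf.constant([3, 2])
    batch_size, go_id = 2, 3

    # Same trick as process_decoder_input: drop the last column, prepend <GO>
    ending = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], go_id), ending], 1)

    # Mask used by sequence_loss so that <PAD> positions contribute nothing
    masks = tf.sequence_mask(target_lengths, tf.reduce_max(target_lengths), dtype=tf.float32)

    with tf.Session() as sess:
        print(sess.run(decoder_input))  # [[3 7 8] [3 5 9]]
        print(sess.run(masks))          # [[1. 1. 1.] [1. 1. 0.]]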


run.py:

   
   
    import tensorflow as tf
    from data_utils import file_to_ids, get_batches
    from config import Config
    from model import Seq2SeqModelBase

    config = Config()
    batch_size = config.batch_size
    source_int, target_int, source_letter_to_int, target_letter_to_int = file_to_ids()
    model = Seq2SeqModelBase(config, source_letter_to_int, target_letter_to_int)

    # Split the data set into train and validation
    train_source = source_int[batch_size:]
    train_target = target_int[batch_size:]
    # Keep one batch aside for validation
    valid_source = source_int[:batch_size]
    valid_target = target_int[:batch_size]
    (valid_targets_batch, valid_sources_batch,
     valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                                                                      source_letter_to_int['<PAD>'],
                                                                      target_letter_to_int['<PAD>']))

    display_step = 50  # print the loss every 50 batches
    checkpoint = "modelsave/trained_model.ckpt"

    # Build the graph and train
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch_i in range(1, config.epochs + 1):
            for batch_i, (targets_batch, sources_batch, targets_lengths, sources_lengths) in enumerate(
                    get_batches(train_target, train_source, batch_size,
                                source_letter_to_int['<PAD>'],
                                target_letter_to_int['<PAD>'])):
                _, loss = sess.run(
                    [model.train_op, model.cost],
                    feed_dict={model.inputs: sources_batch,
                               model.targets: targets_batch,
                               model.target_sequence_length: targets_lengths,
                               model.source_sequence_length: sources_lengths})
                if batch_i % display_step == 0 and batch_i > 0:
                    # Compute the validation loss
                    validation_loss = sess.run(
                        [model.cost],
                        feed_dict={model.inputs: valid_sources_batch,
                                   model.targets: valid_targets_batch,
                                   model.target_sequence_length: valid_targets_lengths,
                                   model.source_sequence_length: valid_sources_lengths})
                    print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f} '
                          '- Validation loss: {:>6.3f}'.format(epoch_i, config.epochs, batch_i,
                                                               len(train_source) // batch_size,
                                                               loss, validation_loss[0]))

        # Save the model
        saver = tf.train.Saver()
        saver.save(sess, checkpoint)
        print('Model Trained and Saved')

Run output:

   
   
    2017-12-26 17:06:15.971564: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
    2017-12-26 17:06:15.971732: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
    2017-12-26 17:06:15.971741: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX2 instructions, but these are available on your machine and could speed up CPU computations.
    2017-12-26 17:06:15.971747: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use FMA instructions, but these are available on your machine and could speed up CPU computations.
    Epoch   1/100 Batch   50/77 - Training Loss:  2.885 - Validation loss:  2.884
    Epoch   2/100 Batch   50/77 - Training Loss:  2.137 - Validation loss:  2.146
    Epoch   3/100 Batch   50/77 - Training Loss:  1.783 - Validation loss:  1.801
    Epoch   4/100 Batch   50/77 - Training Loss:  1.528 - Validation loss:  1.537
    Epoch   5/100 Batch   50/77 - Training Loss:  1.320 - Validation loss:  1.307
    Epoch   6/100 Batch   50/77 - Training Loss:  1.123 - Validation loss:  1.111
    Epoch   7/100 Batch   50/77 - Training Loss:  0.961 - Validation loss:  0.952
    Epoch   8/100 Batch   50/77 - Training Loss:  0.827 - Validation loss:  0.826
    Epoch   9/100 Batch   50/77 - Training Loss:  0.726 - Validation loss:  0.733
    Epoch  10/100 Batch   50/77 - Training Loss:  0.643 - Validation loss:  0.642
    Epoch  11/100 Batch   50/77 - Training Loss:  0.576 - Validation loss:  0.563
    Epoch  12/100 Batch   50/77 - Training Loss:  0.518 - Validation loss:  0.492
    Epoch  13/100 Batch   50/77 - Training Loss:  0.460 - Validation loss:  0.435
    Epoch  14/100 Batch   50/77 - Training Loss:  0.401 - Validation loss:  0.382
    Epoch  15/100 Batch   50/77 - Training Loss:  0.349 - Validation loss:  0.330
    Epoch  16/100 Batch   50/77 - Training Loss:  0.303 - Validation loss:  0.282
    Epoch  17/100 Batch   50/77 - Training Loss:  0.261 - Validation loss:  0.240
    Epoch  18/100 Batch   50/77 - Training Loss:  0.226 - Validation loss:  0.203




Prediction:

   
   
    from data_utils import extract_character_vocab, read_file
    import tensorflow as tf

    batch_size = 128
    source_path = "data/letters_source.txt"
    target_path = "data/letters_target.txt"
    source_data, target_data = read_file(source_path, target_path)
    target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)
    source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)


    def source_to_seq(text):
        '''
        Convert the source input to ids and pad it to a fixed length.
        '''
        sequence_length = 7
        return [source_letter_to_int.get(word, source_letter_to_int['<UNK>']) for word in text] \
               + [source_letter_to_int['<PAD>']] * (sequence_length - len(text))


    input_word = 'xxegcbj'
    text = source_to_seq(input_word)
    checkpoint = "modelsave/trained_model.ckpt"

    loaded_graph = tf.Graph()
    with tf.Session(graph=loaded_graph) as sess:
        # Load the saved model
        loader = tf.train.import_meta_graph(checkpoint + '.meta')
        loader.restore(sess, checkpoint)
        input_data = loaded_graph.get_tensor_by_name('inputs:0')
        logits = loaded_graph.get_tensor_by_name('predictions:0')
        source_sequence_length = loaded_graph.get_tensor_by_name('source_sequence_length:0')
        target_sequence_length = loaded_graph.get_tensor_by_name('target_sequence_length:0')
        # The graph was built with a fixed batch_size, so the single example is tiled batch_size times
        answer_logits = sess.run(logits, {input_data: [text] * batch_size,
                                          target_sequence_length: [len(input_word)] * batch_size,
                                          source_sequence_length: [len(input_word)] * batch_size})[0]
        print("answer_logits is:", answer_logits)

    pad = source_letter_to_int["<PAD>"]
    print('Original input:', input_word)
    print('\nSource')
    print('  Word ids:    {}'.format([i for i in text]))
    print('  Input Words: {}'.format(" ".join([source_int_to_letter[i] for i in text])))
    print('\nTarget')
    print('  Word ids:       {}'.format([i for i in answer_logits if i != pad]))
    print('  Response Words: {}'.format(" ".join([target_int_to_letter[i] for i in answer_logits if i != pad])))

