This article uses an LSTM to automatically generate English abstracts. The main steps are:
(1) Map every word in the training data to an integer by building a word-to-int dictionary. Once we have this one-to-one mapping between words and integers, the whole training corpus can be converted into a sequence of integers.
(2) With the training data prepared, build a generator that yields batches of size batch_size.
(3) Build the LSTM network and train it.
(4) After training, feed the model a seed word, let it generate an integer sequence, and map that sequence back to words.
The same approach works for many tasks, such as generating Jay Chou lyrics, classical Chinese poems, or film scripts; the method is essentially the same.
Now let's start writing the code.
Import the required libraries:
import tensorflow as tf
from tensorflow.contrib import rnn,seq2seq
import numpy as np
Read the data:
# file is the path to the training text (set it to your own data file)
with open(file, 'r') as f:
    txt = f.read()
txt = txt.lower()  # lowercase everything so different capitalizations of the same word become one token
Build a dictionary that maps punctuation to placeholder tokens:
tokenize_dict = {}
tokenize_dict['!'] = "EXCLAMATION_MARK"
tokenize_dict['.'] = "PERIOD"
tokenize_dict[','] = "COMMA"
tokenize_dict['"'] = "QUOTATION_MARK"
tokenize_dict[';'] = "SEMICOLON"
tokenize_dict['?'] = "QUESTION_MARK"
tokenize_dict['('] = "LEFT_PARENTHESES"
tokenize_dict[')'] = "RIGHT_PARENTHESES"
tokenize_dict['--'] = "DASH"
tokenize_dict['\n'] = "RETURN"
Replace the punctuation in the training data:
for key, value in tokenize_dict.items():
    txt = txt.replace(key, ' {} '.format(value))  # pad the token with spaces so it stays separated from the surrounding words
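To see what this replacement does, here is a quick check on a made-up sample sentence (not part of the training data):
sample = "deep learning is fun, isn't it?\n"
for key, value in tokenize_dict.items():
    sample = sample.replace(key, ' {} '.format(value))
print(sample.split())
# ['deep', 'learning', 'is', 'fun', 'COMMA', "isn't", 'it', 'QUESTION_MARK', 'RETURN']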
Next, build the mapping between words and integer ids:
text = txt.split()
words = sorted(set(text))  # sorting makes the word-to-int mapping reproducible across runs
vocab_size = len(words)
# map each word to an integer id
word_to_int = {word: num for num, word in enumerate(words)}
int_to_word = {value: key for key, value in word_to_int.items()}
# convert the input text into the corresponding sequence of integer ids
int_word_data = [word_to_int[word] for word in text]
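A quick sanity check of the mapping (the printed ids depend on your corpus, and 'the' is assumed to occur in it):
print(vocab_size)                       # number of distinct tokens
print(word_to_int['the'])               # some integer id
print(int_to_word[word_to_int['the']])  # 'the' again: the mapping round-trips
print(int_word_data[:10])               # the first ten tokens of the corpus as ids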
Next, build the generator that produces the training batches:
def get_batch(int_word_data, batch_size, len_seq, n_batch):
    # yields (batch_x, batch_y) pairs, where batch_y is batch_x shifted one word to the right
    int_word_data = int_word_data[:n_batch * batch_size * len_seq]
    int_word_data_y = int_word_data[1:]
    int_word_data_y.append(int_word_data[0])  # wrap around so x and y have the same length
    int_word_data = np.array(int_word_data).reshape([-1, n_batch * len_seq])
    int_word_data_y = np.array(int_word_data_y).reshape([-1, n_batch * len_seq])
    for i in range(n_batch):
        batch_x = np.zeros([batch_size, len_seq])
        batch_y = np.zeros([batch_size, len_seq])
        for j in range(batch_size):
            batch_x[j, :] = int_word_data[j, i * len_seq:(i + 1) * len_seq]
            batch_y[j, :] = int_word_data_y[j, i * len_seq:(i + 1) * len_seq]
        yield batch_x, batch_y
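A small usage example on a toy id sequence (the numbers are made up purely to show the shapes and the one-step shift between x and y):
toy = list(range(12))  # pretend these are word ids
for bx, by in get_batch(toy, batch_size=2, len_seq=3, n_batch=2):
    print(bx)  # [[0. 1. 2.]
               #  [6. 7. 8.]]
    print(by)  # [[1. 2. 3.]
               #  [7. 8. 9.]]
    break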
Now build the RNN model:
def init_rnn(hidden_size, layers, batch_size):
    # layers: number of stacked LSTM layers
    # hidden_size: size of the LSTM hidden state, which determines how much the network can remember
    cells = []
    for _ in range(layers):
        cells.append(rnn.BasicLSTMCell(hidden_size))
    cell = rnn.MultiRNNCell(cells, state_is_tuple=True)
    # initialize the LSTM state to all zeros
    in_state = cell.zero_state(batch_size, tf.float32)
    # name the initial state so it can be fetched from the graph later
    in_state = tf.identity(in_state, 'in_state')
    return cell, in_state
def build_rnn(hidden_size, layers, inputs, vocab_size, embed_dim, batch_size):
    # embed_dim: dimensionality of the word embeddings
    input_shape = tf.shape(inputs)
    cell, state = init_rnn(hidden_size, layers, input_shape[0])
    # embed the integer word ids into dense vectors
    embed = tf.contrib.layers.embed_sequence(inputs, vocab_size, embed_dim)
    # note: no initial_state is passed here, so the LSTM always starts from zeros;
    # at generation time the whole generated prefix is fed back in at every step
    output, out_state = tf.nn.dynamic_rnn(cell=cell, inputs=embed, dtype=tf.float32)
    out_state = tf.identity(out_state, name='out_state')
    # a fully connected layer with vocab_size outputs: one logit per word in the vocabulary,
    # which the softmax below turns into a probability for every word
    output = tf.contrib.layers.fully_connected(output, vocab_size, activation_fn=None)
    probability = tf.nn.softmax(output, name='probability')
    return output, out_state
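To keep track of what flows through build_rnn, here is a rough shape walkthrough using the parameter values defined below (batch and sequence lengths are dynamic, so the concrete numbers are only illustrative):
# inputs          : [batch_size, len_seq]               e.g. [64, 20], integer word ids
# embed           : [batch_size, len_seq, embed_dim]    e.g. [64, 20, 200]
# LSTM output     : [batch_size, len_seq, hidden_size]  e.g. [64, 20, 512]
# logits (output) : [batch_size, len_seq, vocab_size]
# probability     : [batch_size, len_seq, vocab_size], softmax over the last axis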
Define the parameters we will use:
# parameter definitions
learning_rate = 0.01
hidden_size = 512
layers = 1
embed_dim = 200
batch_size = 64
len_seq = 20
vocab_size = len(words)
n_batch = len(int_word_data) // (batch_size * len_seq)
epochs = 100
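To make the n_batch arithmetic concrete (the corpus size here is made up):
corpus_len = 100000               # hypothetical number of tokens in the corpus
print(corpus_len // (64 * 20))    # 78 batches per epoch
print(78 * 64 * 20)               # 99840 tokens actually used; get_batch drops the remainder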
Then define the loss, the optimizer, and so on:
# define the inputs: inputs is the data we feed in, y is the corresponding label (the next word at each position)
inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
y = tf.placeholder(tf.int32, [None, None], name='target')
output, state = build_rnn(hidden_size, layers, inputs, vocab_size, embed_dim, batch_size)
# sequence_loss is essentially a cross-entropy loss: it applies softmax to the logits, computes the cross
# entropy against the targets, and averages the result using the given weight matrix (here all ones)
loss = seq2seq.sequence_loss(output, y, tf.ones([batch_size, len_seq]))
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(tf.nn.softmax(output), 2),
                                           tf.cast(y, tf.int64)), tf.float32))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
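For intuition, sequence_loss with an all-ones weight matrix is roughly equivalent to the following hand-written version (a sketch, not a drop-in replacement):
# per-position cross entropy between the logits and the integer targets
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=output)
# with all-ones weights, averaging over time steps and over the batch is just the overall mean
manual_loss = tf.reduce_mean(crossent)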
Start training the model:
with tf.Session() as sess:
    writer = tf.summary.FileWriter('graphs/seq', sess.graph)
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        total_loss = 0
        train_acc = 0
        for batch_x, batch_y in get_batch(int_word_data, batch_size, len_seq, n_batch):
            tmp_loss, tmp_acc, _ = sess.run([loss, accuracy, optimizer],
                                            feed_dict={inputs: batch_x, y: batch_y})
            total_loss += tmp_loss
            # train_acc += tmp_acc
        if epoch % 10 == 0:
            print('Epoch {}/{} train_loss {:.3f}'.format(epoch, epochs, total_loss / n_batch))
    saver = tf.train.Saver()
    saver.save(sess, './checkpoints/arvix/')
    print('Model Trained and Saved')
    writer.close()
Once the model is trained, we can start generating text.
Define a helper that fetches the tensors needed for generation:
def get_state(graph):
    # the input placeholder: generation also needs an input to feed the model
    inputs = graph.get_tensor_by_name('inputs:0')
    # the named initial state: an RNN has memory, so its output depends both on the previous state
    # and on the current input
    in_state = graph.get_tensor_by_name('in_state:0')
    # the output state, which becomes the input state of the next step
    out_state = graph.get_tensor_by_name('out_state:0')
    # the per-word output probabilities, which we sample from to pick the next word
    probability = graph.get_tensor_by_name('probability:0')
    return inputs, in_state, out_state, probability
def get_word(probability, int_to_word):
    # np.random.choice(a, size, p):
    #   a    - an array to sample from; if a is an integer n, sampling is over np.arange(n)
    #   size - shape of the sample to draw (e.g. a single value, or an 8x6 array)
    #   p    - the probability of each candidate value; must sum to 1
    # sample from the distribution at the last time step (the prediction for the next word),
    # then use int_to_word to turn the sampled id back into a word
    int_w = np.random.choice(probability.shape[2], 1, p=probability[0, -1, :])[0]
    return int_to_word[int_w]
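A toy example of this kind of sampling (the vocabulary and probabilities are made up):
toy_probs = np.array([0.1, 0.6, 0.3])                        # probabilities for word ids 0, 1, 2
toy_int_to_word = {0: 'deep', 1: 'learning', 2: 'networks'}
sampled = np.random.choice(3, 1, p=toy_probs)[0]
print(toy_int_to_word[sampled])                              # prints 'learning' most of the time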
Set the seed word and the length of the generated sequence:
# length of the generated text, in tokens
gen_len = 300
begin_word = 'deep'
Run the generation:
# create a new Graph object for the restored model
graph = tf.Graph()
with tf.Session(graph=graph) as sess:
    # tf.train.import_meta_graph restores the saved computation graph instead of redefining it
    load_meta = tf.train.import_meta_graph('./checkpoints/arvix/.meta')
    load_meta.restore(sess, './checkpoints/arvix/')
    inputs, in_state, out_state, probability = get_state(graph)
    gen_sentences = [begin_word]
    # fetch the (zero) initial state; only the batch size of the fed input matters here
    pre_state = sess.run(in_state, feed_dict={inputs: np.array([[5]])})
    for n in range(gen_len):
        # feed the most recent part of the generated text back in as the input sequence
        input_s = [[word_to_int[word] for word in gen_sentences[-gen_len:]]]
        seq_len = len(input_s[0])
        probability_, pre_state = sess.run([probability, out_state],
                                           feed_dict={inputs: input_s, in_state: pre_state})
        gen_word = get_word(probability_, int_to_word)
        gen_sentences.append(gen_word)
    gen_sentences = ' '.join(gen_sentences)
    # turn the punctuation placeholder tokens back into real punctuation
    for key, value in tokenize_dict.items():
        gen_sentences = gen_sentences.replace(value, key + ' ')
    print(gen_sentences)
The generated result looks like this:
INFO:tensorflow:Restoring parameters from ./checkpoints/arvix/
deep stacked rnn networks networks learning stacked rnns learning learning learning learning learning learning learning networks learning learning analysis rnns belief rnn neural neural neural feedforward rnn neural neural learning networks learning feedforward rnn network networks learning nets neural networks neural learning rnns stacked network learning rnns stacked networks autoencoder rnns learning network learning belief stacked autoencoder neural networks networks rnns autoencoder learning networks rnns network networks autoencoder stacked look learning feedforward autoencoder stacked neural rnns learning multi-layer rnn nets learning multi-layer learning learning rnn networks networks networks learning rnn autoencoder network stacked rnns learning models models neural learning clustering learning learning learning look networks learning learning learning networks network neural analysis models belief stacked hierarchical rnn learning rnn learning neural stacked learning networks neural rnns neural autoencoder neural learning networks models neural learning neural belief learning learning learning neural network neural look neural stacked autoencoder learning stacked learning neural learning networks analysis networks belief networks belief network feedforward network multi-layer look rnns models dual-stream neural neural networks learning belief dual-stream learning belief stacked learning learning long learning networks learning learning networks rnn networks learning look multi-layer learning look neural learning network dual-stream neural networks stacked learning stacked neural multi-layer autoencoder neural recurrent networks multi-layer learning learning learning learning rnns learning learning network neural belief networks neural learning learning learning models autoencoder learning recurrent learning stacked learning neural learning stacked neural ones neural networks rendering learning rnn learning multi-layer learning rnns learning belief learning learning networks analysis stacked look learning learning networks learning rnn learning learning networks learning analysis network clustering networks network learning autoencoder stacked learning rnn network dual-stream learning networks learning network stacked learning stacked networks look learning learning learning ones neural neural stacked learning learning learning network look rnns autoencoder neural learning neural learning learning network multi-layer stacked
That is how to use an RNN for sequence prediction and text generation.