1 循环神经网络简介
循环神经网络为了刻画一个序列当前的输出与之前信息的隐藏层之间存在连接,隐藏层的输入来自于输入层的数据以及上一时刻隐藏层的输出。这样的结构使得循环神经网络会对之前的信息有所记忆,同时利用之前的信息影响后面节点的输出,所以循环神经网络不需要分别学习句子每个位置的所有语言特征规则。
如图展示了循环神经网络的一种典型结构,相比之前的网络,循环神经网络更加注重“时刻”的概念。途中 O 表示循环神经网络在时刻 t 给出的一个输出, x 表示在时刻 t 循环神经网络的输入, H 是循环神经网络的主体结构,循环的过程就是 H 的不断被执行。
在 t 时刻,H 会读取输入层的 输入 x, 并输出一个值 o ,同时 H 的形态值会从当前步传递到下一步。也就是说, H 的输入除了来自输入层的输入数据 x ,还来自于上一时刻 H 的输出。
2 循环神经网络的前向传播程序设计
卷积神经网络中也存在着参数共享,这和循环神经网络是类似的。循环神经网络可以看作是某一神经网络结构 在时间序列伤被执行多次的结果。使用循环神经网络解决实际问题时,关键就是如何合理地设计循环体的网络结构。
以下代码使用numpy实现了上述简单循环神经网络前向传播的计算过程:
import numpy as np
#定义相关参数,init_state是输入到t1的t0时刻输出状态
x = [0.8, 0.1]
init_state = [0.3, 0.6]
W = np.asarray([[0.2, 0.4], [0.7, 0.3]])
U = np.asarray([0.8, 0.1])
b_h = np.asarray([0.2, 0.1])
V = np.asarray([[0.5], [0.5]])
b_o = 0.1
#执行两轮循环,模拟前向传播过程
for i in range(len(x)):
#numpy的dot()函数用于矩阵相乘,函数原型为dot(a,b,out)
before_activation = np.dot(init_state, W)+x[i]*U+b_h
#numpy也提供了tanh()函数,用于实现双曲正切函数的计算
state = np.tanh(before_activation)
#本时刻的状态作为下一时刻的初始状态
init_state = state
#计算本时刻的输出
final_output = np.dot(state, V) + b_o
#打印t1和t2时刻的状态和输出信息
print ("t %s state: %s" %(i+1,state))
print ("t%s output: %s\n" %(i+1,final_output))
3 LSTM实现
import tensorflow as tf
import numpy as np
import time
import reader
class PTBModel(object):
def __init__(self, is_training, config, data, name=None):
self.batch_size = batch_size = config.batch_size
self.num_steps = num_steps = config.num_steps
self.epoch_size = ((len(data) // batch_size - 1) // num_steps)
self.input_data, self.targets = reader.ptb_producer(data, batch_size, num_steps, name=name)
self.size = config.hidden_size
self.vocab_size = config.vocab_size
#以LSTM结构作为循环体结构,并且在训练使用 dropout
lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.size,forget_bias=0.0,state_is_tuple=True)
if is_training and config.keep_prob < 1:
lstm_cell = tf.contrib.rnn.DropoutWrapper(lstm_cell,output_keep_prob=config.keep_prob)
#堆叠LSTM单元LSTM_CELL
cell = tf.contrib.rnn.MultiRNNCell([lstm_cell for _ in range(config.num_layers)], state_is_tuple=True)
#初始化最初的状态,即全0的向量
self.initial_state = cell.zero_state(batch_size, tf.float32)
#将单词id转换为单词向量,这里embedding为embedding_lookup()函数的维度信息
#单词总数通过vocab_size传入,每个单词向量的维度是size(参数hidden_size的值)
#这样便得出embedding参数的维度
embedding = tf.get_variable("embedding",[self.vocab_size, self.size], dtype=tf.float32)
#通过embedding_lookup()函数将原本batch_size x num_steps个单词id转换为
#单词向量,转换后的输入层维度为batch_size x num_steps x size
inputs = tf.nn.embedding_lookup(embedding, self.input_data)
#在训练模式下,会对inputs添加dropout
if is_training and config.keep_prob < 1:
inputs = tf.nn.dropout(inputs, config.keep_prob)
#定义输出列表,在这里对不同时刻LSTM结构的输出进行汇总,之后会通过一个全连层得到最终输出
outputs = []
#定义 state 存储不同batch中LSTM的状态,并初始化为0
state = self.initial_state
with tf.variable_scope("RNN"):
for time_step in range(num_steps):
if time_step > 0:
tf.get_variable_scope().reuse_variables()
#从输入数据获取当前时刻的输入并传入LSTM结构
cell_output, state = cell(inputs[:, time_step,:],state)
#使用append()函数执行插入操作
outputs.append(cell_output)
output = tf.reshape(tf.concat(outputs,1),[-1,self.size])
weight = tf.get_variable("softmax_w",[self.size,self.vocab_size],dtype=tf.float32)
bias = tf.get_variable("softmax_b",[self.vocab_size],dtype=tf.float32)
logits = tf.matmul(output, weight) + bias
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits],[tf.reshape(self.targets, [-1])],
[tf.ones([batch_size*num_steps],dtype=tf.float32)])
#计算每个batch的平均损失
self.cost = cost = tf.reduce_sum(loss) / batch_size
self.final_state = state
#只在训练时定义反向传播操作
if not is_training:
return
self.learning_rate = tf.Variable(0.0, trainable=False)
trainable_variables = tf.trainable_variables()
#计算self.cost关于trainable_variables的梯度
gradients = tf.gradients(cost, trainable_variables)
#通过clip_by_global_norm()函数控制梯度大小,以免发生梯度彭胀
clipped_grads, _ = tf.clip_by_global_norm(gradients, config.max_grad_norm)
#使用随机梯度下降优化器并定义训练的步骤
SGDOptimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
self.train_op = SGDOptimizer.apply_gradients(zip(clipped_grads, trainable_variables),
global_step = tf.contrib.framework.get_or_create_global_step())
self.new_learning_rate = tf.placeholder(tf.float32,shape=[],name="new_learning_rate")
self.learning_rate_update = tf.assign(self.learning_rate,self.new_learning_rate)
#定义学习率分配函数,该函数辉在定义会话时用到
def assign_lr(self,session,lr_value):
session.run(self.learning_rate_update, feed_dict={self.new_learning_rate:lr_value})
class Config(object):
init_scale = 0.1
learning_rate = 1.0
max_grad_norm = 5
num_layers = 2
num_steps = 20
hidden_size = 200
max_epoch = 4
total_epoch = 13
keep_prob = 1.0
lr_decay = 0.5
batch_size = 20
vocab_size = 10000
def run_epoch(session,model,train_op=None,output_log=False):
start_time = time.time()
costs = 0.0
iters = 0
state = session.run(model.initial_state)
fetches = {
"cost" : model.cost,
"final_state" : model.final_state,
}
if train_op is not None:
fetches["train_op"] = train_op
for step in range(model.epoch_size):
feed_dict={}
for i, (c, h) in enumerate(model.initial_state):
feed_dict[c] = state[i].c
feed_dict[h] = state[i].h
result = session.run(fetches,feed_dict)
cost = result["cost"]
state = result["final_state"]
costs += cost
iters += model.num_steps
if output_log and step % (model.epoch_size//10) == 10:
print ("step %.3f perplexity: %.3f speed: %0.f words/sec" %(step, np.exp(costs/iters),
iters*model.batch_size/(time.time() - start_time)))
return np.exp(costs/iters)
train_data, valid_date, test_data, _ = reader.ptb_raw_data("./simple-examples/data/")
config = Config()
eval_config = Config()
eval_config.batch_size = 1
eval_config.num_steps = 1
with tf.Graph().as_default():
initializer = tf.random_uniform_initializer(-config.init_scale,config.init_scale)
#定义用于训练的循环神经网络模型
with tf.name_scope("Train"):
with tf.variable_scope("Model",reuse=None,initializer=initializer):
Model_train = PTBModel(is_training=True,config=config,data=train_data,name="TrainModel")
#定义用于验证的循环神经网络模型
with tf.name_scope("Valid"):
with tf.variable_scope("Model",reuse=True,initializer=initializer):
Mode_valid = PTBModel(is_training=False,config=config,data=valid_date,name="ValidModel")
#定义用于测试的循环神经网络模型
with tf.name_scope("Test"):
with tf.variable_scope("Model",reuse=True,initializer=initializer):
Mode_test = PTBModel(is_training=False,config=eval_config,data=test_data,name="TestModel")
sv = tf.train.Supervisor()
with sv.managed_session() as session:
for i in range(config.total_epoch):
lr_decay = config.lr_decay ** max(i+1 - config.max_epoch, 0.0)
Model_train.assign_lr(session, config.learning_rate*lr_decay)
print ("Epoch:%d Learning rate: %.3f"%(i+1,session.run(Model_train.learning_rate)))
#在所有训练数据上训练循环神经网络模型
train_perplexity = run_epoch(session,Model_train,train_op=Model_train.train_op,output_log=True)
print ("Epoch:%d Train Perplexity: %.3f" % (i + 1, train_perplexity))
valid_perplexity = run_epoch(session,Mode_valid)
print ("Epoch:%d Train Perplexity: %.3f" % (i + 1, valid_perplexity))
test_perplexity = run_epoch(session,Mode_test)
print ("Epoch:%d Train Perplexity: %.3f" % (i + 1, test_perplexity))