使用TensorFlow调用PTB文本数据集
TensorFlow已经封装好了读取PTB数据集的库函数,直接调用即可。以下代码用于获取PTB数据集中的训练集、验证集和测试集。新版TensorFlow中不能直接import reader;若想快速调用reader模块,可参见《在tensorflow中直接import reader》一文。PTB数据集可在此下载,代码中只需要其中data文件夹下的内容。
import tensorflow as tf
import reader
# Path of the directory that holds the PTB data.
# Written as a raw string so the backslash is kept literally instead of being
# parsed as the invalid escape sequence "\d", which newer Python versions warn
# about. The runtime value is unchanged.
DATA_PATH = r"\data"
# ptb_raw_data returns four values; the fourth one is not needed here.
train_data, valid_data, test_data, _ = reader.ptb_raw_data(DATA_PATH)
# Show the size of the training set and its first 50 entries.
print(len(train_data))
print(train_data[:50])
输出结果为:
929589
[9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984, 9986, 9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995, 9996, 9997, 9998, 9999, 2, 9256, 1, 3, 72, 393, 33, 2133, 0, 146, 19, 6, 9207, 276, 407, 3, 2, 23, 1, 13, 141, 4, 1, 5465, 0, 3081]
注:PTB数据集中的每个单词都已被转换为整数编号,每一句话结束的标志是编号2(即句末标记<eos>)。
在使用PTB数据集训练RNN时,句子需要被截断,TensorFlow也提供了函数可以直接截断并且组成batch,代码如下:
# Cut the training data into sequences of 5 words and group 4 sequences per batch.
# x holds the truncated sequences; y holds the expected outputs for x, i.e. for
# every word the word that follows it.
x, y = reader.ptb_producer(train_data, 4, 5)
# Open a session and read a single batch.
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        # From here on x and y refer to the fetched values, not to the tensors.
        x, y = sess.run([x, y])
        print(x)
        print(y)
    finally:
        # Stop and join the threads started by start_queue_runners even when
        # fetching the batch fails, so the program does not hang on exit.
        coord.request_stop()
        coord.join(threads)
输出结果为:
[[9970 9971 9972 9974 9975]
[ 332 7147 328 1452 8595]
[1969 0 98 89 2254]
[ 3 3 2 14 24]]
[[9971 9972 9974 9975 9976]
[7147 328 1452 8595 59]
[ 0 98 89 2254 0]
[ 3 2 14 24 198]]
搭建基于LSTM的PTB语言模型
在TensorFlow框架中,使用两层LSTM结构搭建出PTB自然语言模型,代码如下:
import numpy as np
import tensorflow as tf
import reader
# ---- Data location -------------------------------------------------------
DATA_PATH = "/Users/gaoyue/文档/Program/tensorflow_google/chapter8/simple-examples/data"

# ---- Network shape -------------------------------------------------------
vocab_size = 10000   # dictionary size: room for 10000 words
hidden_size = 200    # hidden layer: number of nodes that remember past state
num_layers = 2       # two stacked LSTM layers; each layer feeds the next one

# ---- Training ------------------------------------------------------------
num_epoch = 2
learning_rate = 1.0    # initial learning rate
train_batch_size = 20  # batch size used for training
train_num_step = 35    # length of one training sequence
keep_prob = 0.5        # dropout keeps 50% of the nodes
max_grad_norm = 5      # gradient-clipping bound against exploding gradients

# ---- Evaluation ----------------------------------------------------------
# The sequence length does not need to be limited at test time, so both the
# batch size and the step count are 1.
eval_batch_size = eval_num_step = 1
class PTBModel(object):  # class names use CamelCase
    """Multi-layer LSTM language model for the PTB data (TensorFlow 1.x graph API).

    The constructor builds the whole graph. Afterwards the instance exposes:

    * ``input_data`` / ``targets`` -- int32 placeholders of shape
      ``[batch_size, num_steps]``
    * ``initial_state`` / ``final_state`` -- LSTM state before and after the
      unrolled steps
    * ``cost`` -- summed sequence loss divided by ``batch_size``
    * ``train_op`` -- only present when ``is_training`` is true

    The network size and optimisation settings are read from the module-level
    names ``hidden_size``, ``num_layers``, ``vocab_size``, ``keep_prob``,
    ``max_grad_norm`` and ``learning_rate``.
    """

    def __init__(self, is_training, batch_size, num_steps):
        """Build the model graph.

        Args:
            is_training: when true, dropout is applied and the backward pass
                (``train_op``) is added to the graph.
            batch_size: number of sequences per batch.
            num_steps: number of time steps the LSTM is unrolled for.
        """
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Input layer and expected outputs, both [batch_size, num_steps].
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def make_cell():
            # One LSTM layer, wrapped with output dropout while training.
            layer = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            if is_training:
                layer = tf.nn.rnn_cell.DropoutWrapper(layer, output_keep_prob=keep_prob)
            return layer

        # Every layer needs its own cell object. Passing the same object
        # num_layers times ([cell] * num_layers) either fails with a reuse
        # error or makes all layers share one set of weights, depending on the
        # TF 1.x release.
        cell = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(num_layers)])

        # Start from an all-zero LSTM state for a batch of batch_size sequences.
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # Word embeddings: one vector of length hidden_size per vocabulary entry.
        embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
        # Looking up the ids turns [batch_size, num_steps] into
        # [batch_size, num_steps, hidden_size].
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        if is_training:
            inputs = tf.nn.dropout(inputs, keep_prob)

        # Unroll the LSTM by hand, collecting the output of every time step.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    # All time steps share the variables created at step 0.
                    tf.get_variable_scope().reuse_variables()
                cell_output, state = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Concatenate to [batch_size, hidden_size * num_steps], then reshape to
        # [batch_size * num_steps, hidden_size]; -1 lets TF infer the row count.
        output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

        # Fully connected layer mapping every LSTM output to vocab_size logits,
        # i.e. the scores for the next word.
        weight = tf.get_variable("weight", [hidden_size, vocab_size])
        bias = tf.get_variable("bias", [vocab_size])
        logits = tf.matmul(output, weight) + bias

        # Cross-entropy loss of the sequence. The targets are flattened to one
        # dimension and every position gets the same weight of 1.
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        # Loss summed over all positions, divided by the number of sequences.
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state

        # The backward pass is only defined for the training model.
        if not is_training:
            return

        trainable_variables = tf.trainable_variables()
        # Clip by global norm to keep the gradients from exploding.
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.cost, trainable_variables), max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        # zip pairs each clipped gradient with its variable, e.g.
        # zip([1, 2, 3], [4, 5, 6]) -> (1, 4), (2, 5), (3, 6).
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variables))
# Run the model over one epoch of data and report its perplexity.
def run_epoch(session, model, data, train_op, output_log, epoch_size):
    """Run ``epoch_size`` steps of ``model`` and return the perplexity.

    Perplexity measures how well a language model predicts a sample; the lower
    it is, the better the model predicts.

    Args:
        session: object whose ``run`` method evaluates fetches, optionally
            with a feed dict.
        model: object exposing ``initial_state``, ``final_state``, ``cost``,
            ``input_data``, ``targets`` and ``num_steps``.
        data: fetch that yields one ``(x, y)`` batch per ``session.run`` call.
        train_op: operation run together with the cost at every step.
        output_log: when true, print the running perplexity every 100 steps.
        epoch_size: number of steps to run; must be at least 1.

    Returns:
        ``exp(total cost / total number of time steps)``.

    Raises:
        ValueError: if ``epoch_size`` is smaller than 1. Without this check
            the final division would fail with a bare ZeroDivisionError.
    """
    if epoch_size < 1:
        raise ValueError("epoch_size must be at least 1, got %r" % (epoch_size,))

    total_costs = 0.0
    iters = 0
    state = session.run(model.initial_state)
    for step in range(epoch_size):
        x, y = session.run(data)
        # Feed the state returned by the previous step back in, so the LSTM
        # state is carried across consecutive batches.
        cost, state, _ = session.run(
            [model.cost, model.final_state, train_op],
            {model.input_data: x, model.targets: y, model.initial_state: state})
        # Accumulate the cost and the number of time steps seen so far.
        total_costs += cost
        iters += model.num_steps
        if output_log and step % 100 == 0:
            print("After %d steps, perplexity is %.3f" % (step, np.exp(total_costs / iters)))
    return np.exp(total_costs / iters)
def main():
    """Train the PTB language model and print validation and test perplexity.

    Loads the data from ``DATA_PATH``, builds a training model and an
    evaluation model that share their variables, trains for ``num_epoch``
    epochs with a validation pass after each one, and finishes with a pass
    over the test data.
    """
    train_data, valid_data, test_data, _ = reader.ptb_raw_data(DATA_PATH)

    def steps_per_epoch(data, batch_size, num_steps):
        # Number of batches in one pass over `data`. Each of the batch_size
        # rows holds len(data) // batch_size words; one word is held back
        # because the targets are the inputs shifted by one position.
        # NOTE(review): intended to mirror the epoch size that
        # reader.ptb_producer derives internally -- confirm against the
        # reader module in use.
        return (len(data) // batch_size - 1) // num_steps

    train_epoch_size = steps_per_epoch(train_data, train_batch_size, train_num_step)
    valid_epoch_size = steps_per_epoch(valid_data, eval_batch_size, eval_num_step)
    test_epoch_size = steps_per_epoch(test_data, eval_batch_size, eval_num_step)

    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    # Training model: creates the variables.
    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, train_batch_size, train_num_step)
    # Evaluation model: reuses the variables of the training model.
    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, eval_batch_size, eval_num_step)

    # Passing a no-op as train_op means no optimisation happens during
    # evaluation. It is created once so the graph does not grow on every
    # evaluation pass.
    eval_op = tf.no_op()

    with tf.Session() as session:
        tf.global_variables_initializer().run()

        train_queue = reader.ptb_producer(train_data, train_model.batch_size, train_model.num_steps)
        eval_queue = reader.ptb_producer(valid_data, eval_model.batch_size, eval_model.num_steps)
        test_queue = reader.ptb_producer(test_data, eval_model.batch_size, eval_model.num_steps)

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=session, coord=coord)
        try:
            for i in range(num_epoch):
                print("iteration: %d" % (i + 1))
                run_epoch(session, train_model, train_queue, train_model.train_op, True, train_epoch_size)

                valid_perplexity = run_epoch(session, eval_model, eval_queue, eval_op, False, valid_epoch_size)
                print("Epoch: %d Validation Perplexity: %.3f" % (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, eval_model, test_queue, eval_op, False, test_epoch_size)
            print("Test Perplexity: %.3f" % test_perplexity)
        finally:
            # Stop and join the queue-runner threads even if training fails.
            coord.request_stop()
            coord.join(threads)
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
输出结果为:
iteration: 1
After 0 steps, perplexity is 9987.416
After 100 steps, perplexity is 1314.799
After 200 steps, perplexity is 996.104
.
.
.
After 32200 steps, perplexity is 169.283
After 32300 steps, perplexity is 169.194
After 32400 steps, perplexity is 169.147
.
.
.
注:perplexity可以理解为平均分支系数,即模型在预测下一个词时,平均相当于在多少个等可能的候选词中做选择。例如perplexity = 169,表示模型预测句子中的下一个词时,相当于要在约169个词中选择;通过进一步优化网络,可以继续降低perplexity值。