Encoding the characters of the text
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf
with open('anna.txt', 'r') as f:
    text = f.read()
vocab = sorted(set(text))  # set collects every distinct character in the text; sorted orders them
vocab_to_int = {c: i for i, c in enumerate(vocab)}  # map each character to its index in the sorted list
int_to_vocab = dict(enumerate(vocab))  # the reverse mapping: index as key, character as value
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)  # encode every character in text as an integer
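As a quick sanity check, decoding the first few codes with int_to_vocab should reproduce the start of the text exactly (a minimal sketch; the 20-character window is arbitrary):

sample = ''.join(int_to_vocab[i] for i in encoded[:20])
assert sample == text[:20]  # round-trip: encode then decode recovers the original text
print(sample)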
Generating mini-batches from the data
Define a function that takes the encoded text; the number of sequences per batch and the number of steps per sequence are hyperparameters.
def get_batches(arr, n_seqs, n_steps):
    # Compute the batch size from the sequence and step counts, derive the
    # number of full batches, and drop the leftover characters that do not
    # fill a final batch.
    characters_per_batch = n_seqs * n_steps
    n_batches = len(arr)//characters_per_batch
    arr = arr[:n_batches * characters_per_batch]
    # Reshape to n_seqs rows; the column count is inferred (-1).
    arr = arr.reshape((n_seqs, -1))
    # Generate the feature batch x and the target batch y
    # (each target is the character that follows its input character).
    for n in range(0, arr.shape[1], n_steps):
        x = arr[:, n:n+n_steps]
        y = np.zeros_like(x)
        # Shift the targets one character to the left; the last column of y
        # can be set to the first column of x without hurting accuracy.
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
        # get_batches is a generator: it yields x, y one batch at a time.
        yield x, y
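A minimal usage sketch (the batch shape of 10 sequences by 50 steps is illustrative, not from the source):

batches = get_batches(encoded, 10, 50)
x, y = next(batches)
print(x.shape, y.shape)  # both are (10, 50)
print(x[:2, :10])        # first ten steps of the first two sequences
print(y[:2, :10])        # the same window, shifted one character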
Building the input layer
Create placeholders for the inputs and targets, plus a keep_prob placeholder (used by the dropout layer).
def build_inputs(batch_size, num_steps):
    '''batch_size is the number of sequences per batch (the row count);
    num_steps is the number of steps per sequence (the column count).
    '''
    inputs = tf.placeholder(tf.int32, [batch_size, num_steps], name='inputs')
    targets = tf.placeholder(tf.int32, [batch_size, num_steps], name='targets')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    return inputs, targets, keep_prob
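A minimal sketch of how these placeholders are used later (the graph scoping and the keep_prob value of 0.5 are assumptions for illustration; the feed values x, y would come from get_batches):

with tf.Graph().as_default():
    inputs, targets, keep_prob = build_inputs(10, 50)
    # At training time, each mini-batch is fed through the placeholders:
    # sess.run(loss, feed_dict={inputs: x, targets: y, keep_prob: 0.5})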
Building the LSTM cell
- Create the LSTM cell for the hidden layer:
tf.contrib.rnn.BasicLSTMCell(num_units)
- Wrap the cell with dropout:
tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
For why this helps, see Wojciech Zaremba's paper: Recurrent Neural Network Regularization.
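Putting the two calls together, here is a sketch of a helper that stacks dropout-wrapped LSTM cells into a multi-layer cell (the function name build_lstm and the stacking via tf.contrib.rnn.MultiRNNCell follow the common TF 1.x pattern and are assumptions; the source shows only the two constructors above):

def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    def build_cell():
        # One hidden-layer LSTM cell, with dropout applied to its outputs.
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    # Stack num_layers such cells into a single multi-layer RNN cell.
    cell = tf.contrib.rnn.MultiRNNCell([build_cell() for _ in range(num_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)
    return cell, initial_state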