# Tensorflow[基础篇]——LSTM的理解与实现




# lstm理论知识

LSTM隐含层

（鄙人没有对公式进行严格证明，此处仅作直观说明。）LSTM 在相邻时间步之间传递两个量：细胞状态 state（即 c_t）和隐层输出 h_t，二者组成元组，作为下一个时间步的输入。

# 实战代码

• ## config.py


# config.py
# -*-coding:utf-8-*-#
import string

class ModelConfig(object):
    """Hyper-parameters for the character-level LSTM model."""

    def __init__(self):
        # Length of each unrolled character sequence.
        self.num_unrollings = 10
        # Number of sequences per training batch.
        self.batch_size = 64
        # Vocabulary: the 26 lowercase letters plus the space character.
        self.vocabulary_size = len(string.ascii_lowercase) + 1
        # How often (in steps) to report the loss and generate samples.
        self.summary_frequency = 100
        # Total number of training steps.
        self.num_steps = 7001
        # Number of hidden units in the LSTM cell.
        self.num_nodes = 64


# Shared, module-level configuration instance.
config = ModelConfig()


• ## handle_data.py


# -*-coding:utf-8-*-#
import tensorflow as tf
import string
import zipfile
import numpy as np

# Ordinal of 'a' (97): character ids map 1..26 -> 'a'..'z' and 0 -> space.
first_letter = ord(string.ascii_lowercase[0])

# NOTE(review): this fragment was garbled by the article's formatting. The
# `__init__` below clearly belongs to a text-holding class whose `class`
# header (and wherever `self.text` is assigned) is missing from the excerpt.
def __init__(self, valid_size=1000):
    # First `valid_size` characters become the validation set; the rest train.
    self.valid_text = self.text[:valid_size]
    self.train_text = self.text[valid_size:]

# NOTE(review): the enclosing function header (presumably something like
# `def read_data(filename):`) and the line that reads the archive member into
# `data` are missing — `filename`, `data` and the `return` have no context
# here; confirm against the original article.
with zipfile.ZipFile(filename) as f:
    # Take the first file inside the zip archive.
    name = f.namelist()[0]
    print('file name : %s ' % name)
    return data

def char2id(char):
    """Map a character to its vocabulary id.

    'a'..'z' map to 1..26 and the space maps to 0; any other character is
    reported and also mapped to 0.
    """
    if char in string.ascii_lowercase:
        # first_letter == ord('a'); inlined so the function is self-contained.
        return ord(char) - ord('a') + 1
    elif char == ' ':
        return 0
    else:
        # Typo fixed in the message ("Unexpencted" -> "Unexpected").
        print("Unexpected character: %s " % char)
        return 0

def id2char(dictid):
    """Inverse of char2id: map id 1..26 back to 'a'..'z', anything else to ' '."""
    if dictid <= 0:
        return ' '
    return chr(dictid + first_letter - 1)

def characters(probabilities):
    """Map each row of a (batch, vocab) probability matrix to its most likely character."""
    best_ids = np.argmax(probabilities, 1)
    return [id2char(best) for best in best_ids]

def batches2string(batches):
    """Decode a list of one-hot batches back into per-sequence strings.

    Used to check that generated batches reproduce the original text windows.
    """
    strings = [''] * batches[0].shape[0]
    for batch in batches:
        strings = [prefix + ch for prefix, ch in zip(strings, characters(batch))]
    return strings


• ## BatchGenerator.py


# -*-coding:utf-8-*-#
import numpy as np
from handleData import char2id
from config import config

class BatchGenerator(object):
    """Generates successive batches of one-hot encoded characters from `text`.

    The text is split into `batch_size` equally spaced segments; each call to
    `next()` advances every segment's cursor by `num_unrollings` characters.
    """

    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        # Distance between the starting positions of consecutive sequences.
        segment = self._text_size // self._batch_size
        # Current read position of each of the batch_size sequences.
        self._cursor = [offset * segment for offset in range(self._batch_size)]
        # Cache the last batch so consecutive next() calls overlap by one
        # character (one batch's label is the next batch's input).
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single (batch_size, vocabulary_size) one-hot batch from
        the current cursor positions, advancing each cursor by one.
        """
        # BUG FIX: `np.float` was removed in NumPy 1.24; the builtin `float`
        # gives the same dtype (float64).
        batch = np.zeros(shape=(self._batch_size, config.vocabulary_size), dtype=float)
        for b in range(self._batch_size):
            # One-hot encode the character under cursor b, then advance the
            # cursor, wrapping around the end of the text.
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        """Return num_unrollings + 1 consecutive batches.

        The first entry is the last batch of the previous call, so successive
        windows overlap by exactly one character.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches



# NOTE(review): demo snippet from the article — `train_text`, `valid_text`,
# `batch_size` and `num_unrollings` must already be defined elsewhere (see
# handle_data / config); they are not defined in this excerpt.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
# The validation generator walks one character at a time through one sequence.
valid_batches = BatchGenerator(valid_text, 1, 1)

# Sanity check: decoded batches should reproduce consecutive text windows
# (see the sample output reproduced below in the article).
print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))


['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nationa', 'd monasteri', 'raca prince', 'chard baer ', 'rgical lang', 'for passeng', 'the nationa', 'took place ', 'ther well k', 'seven six s', 'ith a gloss', 'robably bee', 'to recogniz', 'ceived the ', 'icant than ', 'ritic of th', 'ight in sig', 's uncaused ', ' lost as in', 'cellular ic', 'e size of t', ' him a stic', 'drugs confu', ' take to co', ' the priest', 'im to name ', 'd barred at', 'standard fo', ' such as es', 'ze on the g', 'e of the or', 'd hiver one', 'y eight mar', 'the lead ch', 'es classica', 'ce the non ', 'al analysis', 'mormons bel', 't or at lea', ' disagreed ', 'ing system ', 'btypes base', 'anguages th', 'r commissio', 'ess one nin', 'nux suse li', ' the first ', 'zi concentr', ' society ne', 'elatively s', 'etworks sha', 'or hirohito', 'litical ini', 'n most of t', 'iskerdoo ri', 'ic overview', 'air compone', 'om acnm acc', ' centerline', 'e than any ', 'devotional ', 'de such dev']
[' a']
['an']


• ## sample.py


# -*-coding:utf-8-*-#

import random
import numpy as np
from config import config

def sample_distribution(distribution):
    """Sample one index from a discrete probability distribution.

    Walks the cumulative sum until it exceeds a uniform random threshold;
    falls back to the last index to guard against rounding shortfall.
    """
    threshold = random.uniform(0, 1)
    cumulative = 0
    for index, weight in enumerate(distribution):
        cumulative += weight
        if cumulative >= threshold:
            return index
    return len(distribution) - 1

def sample(prediction):
    """Turn a (1, vocab) probability row into a one-hot row vector by
    sampling a single index from it.
    """
    # BUG FIX: `np.float` was removed in NumPy 1.24; builtin `float` is the
    # same dtype (float64).
    p = np.zeros(shape=[1, config.vocabulary_size], dtype=float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """Return a random (1, vocabulary_size) row vector normalized to sum to 1."""
    weights = np.random.uniform(0.0, 1.0, size=[1, config.vocabulary_size])
    return weights / weights.sum(axis=1, keepdims=True)

• ## lstm_model.py


# -*-coding:utf-8-*-#
import tensorflow as tf
from config import config

class LSTM_Cell(object):
    """Character-level LSTM with explicitly managed gate parameters (TF 1.x).

    Each of the four gates — input (i), candidate/memory (c), forget (f) and
    output (o) — has an input-to-hidden matrix, a recurrent hidden-to-hidden
    matrix and a bias row vector.
    """

    def __init__(self, train_data, train_label, num_nodes=64):
        # One variable scope per gate so the tf.get_variable names inside
        # _generate_w_b do not collide.
        with tf.variable_scope("input", initializer=tf.truncated_normal_initializer(-0.1, 0.1)):
            self.ix, self.im, self.ib = self._generate_w_b(
                x_weights_size=[config.vocabulary_size, num_nodes],
                m_weights_size=[num_nodes, num_nodes],
                biases_size=[1, num_nodes])
        with tf.variable_scope("memory", initializer=tf.truncated_normal_initializer(-0.1, 0.1)):
            self.cx, self.cm, self.cb = self._generate_w_b(
                x_weights_size=[config.vocabulary_size, num_nodes],
                m_weights_size=[num_nodes, num_nodes],
                biases_size=[1, num_nodes])
        with tf.variable_scope("forget", initializer=tf.truncated_normal_initializer(-0.1, 0.1)):
            self.fx, self.fm, self.fb = self._generate_w_b(
                x_weights_size=[config.vocabulary_size, num_nodes],
                m_weights_size=[num_nodes, num_nodes],
                biases_size=[1, num_nodes])
        with tf.variable_scope("output", initializer=tf.truncated_normal_initializer(-0.1, 0.1)):
            self.ox, self.om, self.ob = self._generate_w_b(
                x_weights_size=[config.vocabulary_size, num_nodes],
                m_weights_size=[num_nodes, num_nodes],
                biases_size=[1, num_nodes])

        # Classifier parameters: hidden output -> vocabulary logits.
        self.w = tf.Variable(tf.truncated_normal([num_nodes, config.vocabulary_size], -0.1, 0.1))
        self.b = tf.Variable(tf.zeros([config.vocabulary_size]))

        # Output (h) and cell state (c) carried across unrolled windows;
        # updated via assign ops, therefore not trainable.
        self.saved_output = tf.Variable(tf.zeros([config.batch_size, num_nodes]), trainable=False)
        self.saved_state = tf.Variable(tf.zeros([config.batch_size, num_nodes]), trainable=False)

        self.train_data = train_data
        self.train_label = train_label

    def _generate_w_b(self, x_weights_size, m_weights_size, biases_size):
        """Create one gate's (x_weights, m_weights, biases) in the current scope."""
        x_w = tf.get_variable("x_weights", x_weights_size)
        # Fixed a typo in the variable name ("m_weigths" -> "m_weights").
        m_w = tf.get_variable("m_weights", m_weights_size)
        # BUG FIX: the bias was previously created with shape
        # `config.batch_size` instead of the requested `biases_size`; that
        # only happened to broadcast correctly because batch_size ==
        # num_nodes == 64.
        b = tf.get_variable("biases", biases_size, initializer=tf.constant_initializer(0.0))
        return x_w, m_w, b

    def _run(self, input, output, state):
        """One LSTM step; returns (h_t, c_t) given input x_t, previous h and c."""
        forget_gate = tf.sigmoid(tf.matmul(input, self.fx) + tf.matmul(output, self.fm) + self.fb)
        input_gate = tf.sigmoid(tf.matmul(input, self.ix) + tf.matmul(output, self.im) + self.ib)
        # Candidate cell content (pre-activation).
        update = tf.matmul(input, self.cx) + tf.matmul(output, self.cm) + self.cb
        # c_t = f_t * c_{t-1} + i_t * tanh(candidate)
        state = state * forget_gate + tf.tanh(update) * input_gate
        output_gate = tf.sigmoid(tf.matmul(input, self.ox) + tf.matmul(output, self.om) + self.ob)
        # h_t = o_t * tanh(c_t)
        return output_gate * tf.tanh(state), state

    def loss_func(self):
        """Unroll the LSTM over train_data and build the training ops.

        Returns (logits, loss, train_prediction).
        """
        outputs = list()
        output = self.saved_output
        state = self.saved_state
        for i in self.train_data:
            output, state = self._run(i, output, state)
            outputs.append(output)
        # Finally, len(outputs) == num_unrollings.
        # Persist the final output/state before computing the loss, so the
        # next unrolled window continues from where this one stopped.
        with tf.control_dependencies([
                self.saved_output.assign(output),
                self.saved_state.assign(state)]):
            # Concatenate the per-step outputs along dim 0; the labels are
            # concatenated the same way so rows line up with the logits.
            logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), self.w, self.b)
            loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=tf.concat(self.train_label, 0),
                    logits=logits))
            train_prediction = tf.nn.softmax(logits)
        return logits, loss, train_prediction


lstm cell内部的模型结构

LSTM的变量分析

• x_t: 该LSTM cell的输入向量
• h_t: 该LSTM cell的输出向量
• c_t: 该LSTM cell的状态向量
• W, U 和 b：参数矩阵和向量
• f_t, i_t和 o_t都是阀门向量
其中:
• f_t为忘记阀门向量。它表示过去旧的信息的记忆权重（0就是应该要忘记，1就是要保留的）
• i_t为输入阀门。它表示接受新内容的权重是多少（0就是应该要忘记，1就是要保留的）
• o_t为输出阀门，它表示状态向量中有多少内容应当输出到 h_t

• ## 在main.py的辅助函数


def get_optimizer(loss):
    """Build the training op for `loss`.

    Uses gradient descent with an exponentially decaying learning rate and
    clips gradients to a global norm of 1.25 to avoid exploding gradients.

    Returns (optimizer_op, learning_rate_tensor).

    NOTE(review): the original article lost the optimizer-construction lines;
    this body reconstructs them from the surviving comments (compute
    gradients, clip by global norm 1.25, apply) — confirm against the source.
    """
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # To avoid exploding gradients, compute the gradients' global 2-norm and
    # rescale them whenever that norm exceeds 1.25.
    gradients, variables = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # Assemble the clipped gradients into the gradient-descent update op.
    optimizer = optimizer.apply_gradients(
        zip(gradients, variables), global_step=global_step)
    return optimizer, learning_rate

def logprob(predictions, labels):
    """Average cross-entropy (in nats) between one-hot labels and predictions.

    Probabilities are clipped below at 1e-10 so log(0) never occurs.

    BUG FIX: the original clipped `predictions` in place, silently mutating
    the caller's array; np.maximum builds a clipped copy instead.
    """
    clipped = np.maximum(predictions, 1e-10)
    return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]


• ## 训练

### 定义好数据流和模型



# Batch generators for training and (one-char-at-a-time) validation.
train_batcher = BatchGenerator(text=train_text, batch_size=config.batch_size, num_unrollings=config.num_unrollings)
# NOTE(review): "vaild_batcher" is a typo for "valid_batcher"; kept as-is in
# case later article code refers to this name.
vaild_batcher = BatchGenerator(text=valid_text, batch_size=1, num_unrollings=1)

# The training data consists of num_unrollings + 1 placeholders: the inputs
# are batches [0, num_unrollings) and the labels are the same batches shifted
# forward by one position.
train_data = list()
for _ in range(config.num_unrollings + 1):
    train_data.append(
        tf.placeholder(tf.float32, shape=[config.batch_size, config.vocabulary_size]))

train_input = train_data[:config.num_unrollings]
train_label = train_data[1:]

# Define the LSTM training model.
model = LSTM_Cell(
    train_data=train_input,
    train_label=train_label)
# Get the loss and prediction ops, then the clipped-gradient training op.
logits, loss, train_prediction = model.loss_func()
optimizer, learning_rate = get_optimizer(loss)


### 定义样本


# Sampling graph (text generation with the trained network): one-character
# input, persistent output/state variables, and a reset op.
sample_input = tf.placeholder(tf.float32, shape=[1, config.vocabulary_size])
save_sample_output = tf.Variable(tf.zeros([1, config.num_nodes]))
save_sample_state = tf.Variable(tf.zeros([1, config.num_nodes]))
# Zero the sampler's recurrent output/state before generating a fresh sentence.
reset_sample_state = tf.group(
    save_sample_output.assign(tf.zeros([1, config.num_nodes])),
    save_sample_state.assign(tf.zeros([1, config.num_nodes])))

# Run one LSTM step on the single-character sample input.
sample_output, sample_state = model._run(
    sample_input, save_sample_output, save_sample_state)
with tf.control_dependencies([save_sample_output.assign(sample_output),
                              save_sample_state.assign(sample_state)]):
    # Next-character distribution (the state save ops run first).
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, model.w, model.b))


### 开始训练


# Training loop.
# NOTE(review): the article lost the original indentation; the nesting below
# is reconstructed from the control-flow keywords — confirm against the source.
with tf.Session() as session:
    tf.global_variables_initializer().run()
    print("Initialized....")
    mean_loss = 0
    for step in range(config.num_steps):
        # Feed the num_unrollings + 1 overlapping batches into the placeholders.
        batches = train_batcher.next()
        feed_dict = dict()
        for i in range(config.num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run([optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        # Accumulate the loss to report an average per summary period.
        mean_loss += l
        if step % config.summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / config.summary_frequency
            print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # Labels are the input batches shifted by one character.
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (config.summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    # Seed the sentence with one random character, then feed
                    # each sampled character back in for 79 more steps.
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Reset sampler state after each summary period.
            reset_sample_state.run()


# 总结

1. 本文仅帮助深入理解 LSTM 的公式和原理（但没有证明其收敛性和长期依赖性），并借此熟悉 TensorFlow 的一些操作。
2. 这里用one-hot作为词向量的方法是不行的，假如要提高准确率的话，就需要使用word2vec这些东西来表示每个字符（单词）的向量。