利用lstm生成文本——Character-RNN的介绍

最新推荐文章于 2024-08-08 07:40:37 发布

dxk_093812

最新推荐文章于 2024-08-08 07:40:37 发布

阅读量1.1k

点赞数 1

分类专栏：深度学习文章标签： LSTM RNN 循环神经网络 TensorFlow 文本生成

本文链接：https://blog.csdn.net/dxk_093812/article/details/84936727

版权

深度学习专栏收录该内容

1 篇文章 0 订阅

订阅专栏

Char-RNN项目由Andrej Karpathy提出，出自他的博文The Unreasonable Effectiveness of Recurrent Neural Networks。RNN具有强大的时序记忆能力，擅长处理序列问题。Char-RNN模型就是从字符的维度上，让机器逐个字符地生成文本。利用Char-RNN可以写诗、写小说等。

Char-RNN的源代码使用的是Torch框架，地址在这：char-rnn(torch)。不过，我们这篇文章想来讲讲它在TensorFlow框架中的运行情况。当然，已经有人将代码移植到TensorFlow中——char-rnn(tensorflow)。我们直接使用这个代码。

代码分为四个模块：

train.py:
sample.py:
read_utils.py:
model.py:

train.py

该模块就是项目的入口，调用model进行训练。

首先是模型中使用的参数。

import tensorflow as tf
from read_utils import TextConverter, batch_generator
from model import CharRNN
import os
import codecs   

FLAGS = tf.flags.FLAGS  # container through which the parsed command-line flags are read

# Every DEFINE_* call takes (flag name, default value, help text).
tf.flags.DEFINE_string('name', 'default', 'name of the model')  # also used as the sub-directory under model/
tf.flags.DEFINE_integer('num_seqs', 100, 'number of seqs in one batch')  # sequences per batch
tf.flags.DEFINE_integer('num_steps', 100, 'length of one seq')  # characters per sequence
tf.flags.DEFINE_integer('lstm_size', 128, 'size of hidden state of lstm')  # hidden units per LSTM cell
tf.flags.DEFINE_integer('num_layers', 2, 'number of lstm layers')  # depth of the stacked RNN
# One-hot input is enough for small alphabets (e.g. English); large character
# sets (e.g. Chinese) should enable the embedding layer instead.
tf.flags.DEFINE_boolean('use_embedding', False, 'whether to use embedding')
tf.flags.DEFINE_integer('embedding_size', 128, 'size of embedding')  # embedding vector dimension
tf.flags.DEFINE_float('learning_rate', 0.001, 'learning_rate')
# The value is fed to the model as a *keep* probability (DropoutWrapper's
# output_keep_prob), so the help text says so instead of "dropout rate".
tf.flags.DEFINE_float('train_keep_prob', 0.5, 'dropout keep probability during training')
tf.flags.DEFINE_string('input_file', '', 'utf8 encoded text file')  # training corpus
tf.flags.DEFINE_integer('max_steps', 100000, 'max steps to train')
tf.flags.DEFINE_integer('save_every_n', 1000, 'save the model every n steps')
tf.flags.DEFINE_integer('log_every_n', 10, 'log to the screen every n steps')
# Only the max_vocab most frequent characters are kept in the vocabulary.
tf.flags.DEFINE_integer('max_vocab', 3500, 'max char number')

对数据进行预处理。调用read_utils.py模块中的文本转换类TextConverter，获取经过频数挑选的字符并且得到相应的index。调用batch_generator函数得到一个batch生成器。

def main(_):
    """Training entry point: prepare the data, then build and train the model.

    The single positional argument is ignored.
    """
    # Checkpoints and the pickled vocabulary live under model/<name>.
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Read the whole UTF-8 corpus into memory.
    with codecs.open(FLAGS.input_file, encoding='utf-8') as corpus_file:
        corpus = corpus_file.read()

    # Build the character <-> id mapping and persist it next to the model.
    converter = TextConverter(corpus, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))

    # Encode the corpus as ids and wrap it in a batch generator.
    encoded = converter.text_to_arr(corpus)
    g = batch_generator(encoded, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)  # number of character classes

数据处理完毕后，调用model.py模块的CharRNN类构造循环神经网络，最后调用train()函数对神经网络进行训练

    # Build the network from the command-line hyper-parameters.
    model = CharRNN(converter.vocab_size,   # number of character classes
                    num_seqs=FLAGS.num_seqs,  # sequences per batch
                    num_steps=FLAGS.num_steps,  # characters per sequence
                    lstm_size=FLAGS.lstm_size,  # hidden units per LSTM cell
                    num_layers=FLAGS.num_layers,  # number of stacked LSTM layers
                    learning_rate=FLAGS.learning_rate,  # optimizer learning rate
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size
                    )
    # Train on batches from g, writing checkpoints under model_path.
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                )


# Script entry point: hand control to tf.app.run(), which is expected to parse
# the flags defined in this file and then call main().
if __name__ == '__main__':
    tf.app.run()

read_utils.py

该模块为数据预处理的实现模块。主要是一个文本转换类和batch生成器，文本转换类是对字符按频数排序，取频数高的若干字符，并且得到相应的index数组。

import numpy as np
import copy        #引入该模块的浅拷贝，不改变源对象的值
import time
import tensorflow as tf
import pickle      #该模块能序列化对象并保存在磁盘中


class TextConverter(object):
    """Two-way mapping between characters and integer ids.

    The vocabulary is a list of characters ordered by descending frequency,
    truncated to at most ``max_vocab`` entries.  A character's id is its
    position in that list; the extra id ``len(vocab)`` stands for every
    character outside the vocabulary and decodes to ``'<unk>'``.
    """

    def __init__(self, text=None, max_vocab=5000, filename=None):
        """Build the vocabulary from ``text`` or load a pickled one.

        Args:
            text: Training text; required when ``filename`` is None.
            max_vocab: Maximum number of characters kept in the vocabulary.
            filename: Path of a vocabulary written by ``save_to_file``.
                Takes precedence over ``text`` when given.

        Raises:
            TypeError: If neither ``text`` nor ``filename`` is provided.
        """
        if filename is not None:
            # NOTE: unpickling can execute arbitrary code; only load
            # vocabulary files that come from a trusted source.
            with open(filename, 'rb') as f:
                self.vocab = pickle.load(f)
        else:
            if text is None:
                raise TypeError('either text or filename must be provided')
            from collections import Counter

            # Counter keeps first-seen order, and the sort below is stable,
            # so equally frequent characters always get the same ids for the
            # same text (seeding the table from a set made this order vary).
            counts = Counter(text)
            print(len(counts))  # number of distinct characters in the text
            ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
            self.vocab = [char for char, _ in ranked[:max_vocab]]

        # Lookup tables: character -> id and id -> character.
        self.word_to_int_table = {c: i for i, c in enumerate(self.vocab)}
        self.int_to_word_table = dict(enumerate(self.vocab))

    @property
    def vocab_size(self):
        """Number of ids in use: the vocabulary plus the unknown id."""
        return len(self.vocab) + 1

    def word_to_int(self, word):
        """Return the id of ``word``, or ``len(vocab)`` if it is unknown."""
        return self.word_to_int_table.get(word, len(self.vocab))

    def int_to_word(self, index):
        """Return the character for ``index`` (``'<unk>'`` for the unknown id).

        Raises:
            Exception: If ``index`` is larger than the unknown id.
        """
        if index == len(self.vocab):
            return '<unk>'
        if index < len(self.vocab):
            return self.int_to_word_table[index]
        raise Exception('Unknown index!')

    def text_to_arr(self, text):
        """Encode ``text`` as a NumPy array with one id per character."""
        return np.array([self.word_to_int(word) for word in text])

    def arr_to_text(self, arr):
        """Decode a sequence of ids back into a single string."""
        return "".join(self.int_to_word(index) for index in arr)

    def save_to_file(self, filename):
        """Pickle the vocabulary list to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self.vocab, f)

batch生成器根据得到的index数组，生成batch，这里需要特别理解生成文本的RNN的输入及输出是什么形式的。

def batch_generator(arr, n_seqs, n_steps):
    """Yield ``(x, y)`` training batches forever.

    The ids are trimmed to a whole number of batches and reshaped to
    ``n_seqs`` rows.  Each pass shuffles the rows and then walks along them
    in windows of ``n_steps`` columns.

    Args:
        arr: 1-D sequence of character ids (NumPy array or list).
        n_seqs: Number of sequences (rows) per batch.
        n_steps: Number of characters (columns) per sequence.

    Yields:
        Tuples ``(x, y)`` of shape ``(n_seqs, n_steps)``.  ``y`` is ``x``
        shifted left by one position; its last column wraps around to the
        first column of ``x``.

    Raises:
        ValueError: If ``arr`` holds fewer than ``n_seqs * n_steps`` ids.
            Without this check the loop below would spin forever without
            yielding anything.
    """
    arr = np.array(arr)  # private copy: the rows are shuffled in place below
    batch_size = n_seqs * n_steps  # ids consumed by one batch
    n_batches = len(arr) // batch_size
    if n_batches == 0:
        raise ValueError(
            'input has {} ids but one batch needs n_seqs * n_steps = {}'.format(
                len(arr), batch_size))
    # Drop the tail that does not fill a whole batch, then lay the ids out
    # as n_seqs rows; -1 lets NumPy infer the row length.
    arr = arr[:batch_size * n_batches].reshape((n_seqs, -1))
    while True:
        np.random.shuffle(arr)  # shuffles along the first axis, i.e. whole rows
        for n in range(0, arr.shape[1], n_steps):
            x = arr[:, n:n + n_steps]
            y = np.zeros_like(x)
            # Targets are the inputs shifted by one; the final target
            # reuses the window's first input.
            y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
            yield x, y

model.py

该模块就是循环神经网络的构造及训练模块。

首先理解CharRNN类中的参数。

from __future__ import print_function  #加上这句之后，即使在python2.X，使用print就得像python3.X那样加括号使用
import tensorflow as tf
import numpy as np
import time
import os

def pick_top_n(preds, vocab_size, top_n=5):
    """Sample the next character id from the ``top_n`` most likely ones.

    Args:
        preds: Predicted probabilities for the next character; size-1
            dimensions are squeezed away.  The argument is not modified.
        vocab_size: Number of character ids to choose from; must equal the
            number of probabilities in ``preds``.
        top_n: How many of the most probable ids stay eligible.

    Returns:
        The sampled character id.
    """
    # np.squeeze returns a view, so copy before zeroing entries; otherwise
    # the caller's array would be overwritten.
    p = np.squeeze(preds).copy()
    # Zero every probability outside the top_n largest.
    p[np.argsort(p)[:-top_n]] = 0
    # Renormalise so that the remaining probabilities sum to one.
    p = p / np.sum(p)
    # Draw a single id (the 1 is the sample size) according to p.
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c


class CharRNN:
    """Character-level multi-layer LSTM language model."""

    def __init__(self, num_classes, num_seqs=64, num_steps=50,
                 lstm_size=128, num_layers=2, learning_rate=0.001,
                 grad_clip=5, sampling=False, train_keep_prob=0.5, use_embedding=False, embedding_size=128):
        """Store the hyper-parameters and build the whole graph.

        Args:
            num_classes: Number of distinct character ids.
            num_seqs: Sequences per batch.
            num_steps: Characters per sequence.
            lstm_size: Hidden units per LSTM cell.
            num_layers: Number of stacked LSTM layers.
            learning_rate: Learning rate passed to the optimizer.
            grad_clip: Global-norm threshold used for gradient clipping.
            sampling: When exactly True, the batch shape is forced to 1 x 1
                so that text can be generated one character at a time.
            train_keep_prob: Keep probability fed to dropout while training.
            use_embedding: Use an embedding layer instead of one-hot input.
            embedding_size: Dimension of the embedding vectors.
        """
        if sampling is True:
            # Generation feeds a single character per step.
            num_seqs, num_steps = 1, 1

        # Data shape.
        self.num_classes = num_classes
        self.num_seqs = num_seqs
        self.num_steps = num_steps

        # Network architecture.
        self.lstm_size = lstm_size
        self.num_layers = num_layers

        # Optimisation settings.
        self.learning_rate = learning_rate
        self.grad_clip = grad_clip
        self.train_keep_prob = train_keep_prob

        # Input representation.
        self.use_embedding = use_embedding
        self.embedding_size = embedding_size

        # Start from a clean default graph, then build it piece by piece.
        tf.reset_default_graph()
        self.build_inputs()
        self.build_lstm()
        self.build_loss()
        self.build_optimizer()
        self.saver = tf.train.Saver()

定义输入层，这里要注意的是对于中英文采用不同的处理办法。由于英文字符种类有限，因此我们直接使用one-hot编码。而中文字符种类繁多，不能像英文那样用少量字符就可实现全部表示，若使用one-hot编码的话会造成稀疏的问题。因此对于中文，我们使用embedding层。

tf.nn.embedding_lookup(embedding, self.inputs)就是选取一个张量里面索引对应的元素。比如inputs=[1,3,5]，则找出embeddings中第1，3，5行，组成一个tensor返回。本例中inputs为一个二维数组，则返回的便是一个三维张量。

    def build_inputs(self):
        """Create the placeholders and the tensor that is fed to the LSTM.

        Sets ``self.inputs`` and ``self.targets`` (int32 placeholders of
        shape ``(num_seqs, num_steps)``), ``self.keep_prob`` and
        ``self.lstm_inputs``.
        """
        with tf.name_scope('inputs'):
            self.inputs = tf.placeholder(tf.int32, shape=(
                self.num_seqs, self.num_steps), name='inputs')
            self.targets = tf.placeholder(tf.int32, shape=(
                self.num_seqs, self.num_steps), name='targets')
            self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

            # Large character sets (e.g. Chinese) should use the embedding
            # layer; for a small alphabet (e.g. English) one-hot is enough.
            if self.use_embedding is False:
                self.lstm_inputs = tf.one_hot(self.inputs, self.num_classes)  # one-hot: num_seqs x num_steps x num_classes
            else:
                with tf.device("/cpu:0"):
                    embedding = tf.get_variable('embedding', [self.num_classes, self.embedding_size])
                    self.lstm_inputs = tf.nn.embedding_lookup(embedding, self.inputs)  # one embedding_size vector per input id

定义深层神经网络。

dropout是一种防止过拟合的正则化方法（TensorFlow中提供了相应的函数），一般用在全连接层。它的原理是在每次训练迭代中以一定的概率随机丢弃一部分神经元，使其在这一次迭代中不参与计算，对应的权值也不更新。

rnn中使用DropoutWrapper类来实现dropout功能。该类通过两个参数控制dropout的概率，一个参数为input_keep_prob，它用来控制输入的dropout概率。另一个为output_keep_prob，它可以用来控制输出的dropout概率。

    def build_lstm(self):
        """Build the stacked LSTM and the softmax output layer.

        Sets ``self.initial_state``, ``self.lstm_outputs``,
        ``self.final_state``, ``self.logits`` and ``self.proba_prediction``.
        """
        # Factory for one layer: a basic LSTM cell with dropout on its output.
        def get_a_cell(lstm_size, keep_prob):
            lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
            drop = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=keep_prob)
            return drop

        with tf.name_scope('lstm'):
            # Stack num_layers cells into one multi-layer cell.
            cell = tf.nn.rnn_cell.MultiRNNCell(
                [get_a_cell(self.lstm_size, self.keep_prob) for _ in range(self.num_layers)]
            )
            self.initial_state = cell.zero_state(self.num_seqs, tf.float32)  # all-zero starting state

            # Unroll the cell along the time dimension.
            self.lstm_outputs, self.final_state = tf.nn.dynamic_rnn(cell, self.lstm_inputs, initial_state=self.initial_state)

            # NOTE(review): lstm_outputs is presumably already a single tensor,
            # in which case this concat leaves it unchanged - confirm.
            seq_output = tf.concat(self.lstm_outputs, 1)
            x = tf.reshape(seq_output, [-1, self.lstm_size])    # flatten to rows of lstm_size values

            with tf.variable_scope('softmax'):
                softmax_w = tf.Variable(tf.truncated_normal([self.lstm_size, self.num_classes], stddev=0.1))
                softmax_b = tf.Variable(tf.zeros(self.num_classes))

            # Output layer: linear projection to logits, then softmax.
            self.logits = tf.matmul(x, softmax_w) + softmax_b
            self.proba_prediction = tf.nn.softmax(self.logits, name='predictions')

定义全局损失，使用tf.nn.softmax_cross_entropy_with_logits()函数计算交叉熵。

    def build_loss(self):
        """Define ``self.loss`` as the mean softmax cross-entropy.

        The targets are one-hot encoded and reshaped to the shape of
        ``self.logits`` before the cross-entropy is computed.
        """
        with tf.name_scope('loss'):
            targets_one_hot = tf.one_hot(self.targets, self.num_classes)
            flat_targets = tf.reshape(targets_one_hot, self.logits.get_shape())
            per_step_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=flat_targets)
            self.loss = tf.reduce_mean(per_step_loss)

进行梯度裁剪。在进行梯度计算的过程中，可能会出现梯度爆炸的情况，也就是偏导数很大。我们知道，损失函数的值沿着负梯度的方向下降；若梯度过大，参数每次更新的步长就会过大，造成函数值跳来跳去，无法收敛到最小值。

我们使用tf.clip_by_global_norm()函数对梯度进行裁剪，论文提出对梯度的L2范数进行裁剪，也就是所有参数偏导数的平方和再开方。

    def build_optimizer(self):
        """Define ``self.optimizer``: an Adam update on clipped gradients.

        The gradients of the loss are clipped by global norm to
        ``self.grad_clip`` before being applied, which guards against
        exploding gradients.
        """
        trainable = tf.trainable_variables()
        # Gradients of the loss with respect to every trainable variable.
        raw_grads = tf.gradients(self.loss, trainable)
        clipped_grads, _ = tf.clip_by_global_norm(raw_grads, self.grad_clip)
        adam = tf.train.AdamOptimizer(self.learning_rate)
        # Pair each clipped gradient with its variable and apply the update.
        self.optimizer = adam.apply_gradients(zip(clipped_grads, trainable))


    def train(self, batch_generator, max_steps, save_path, save_every_n, log_every_n):
        """Run the training loop and write checkpoints.

        Args:
            batch_generator: Iterable of ``(x, y)`` batches.
            max_steps: Training stops once this many steps have run.
            save_path: Directory that receives the ``model`` checkpoints.
            save_every_n: Save a checkpoint every this many steps.
            log_every_n: Print step, loss and timing every this many steps.
        """
        self.session = tf.Session()
        # NOTE(review): the session is used as a context manager, so it is
        # presumably closed when this block exits - confirm before reusing it.
        with self.session as sess:
            sess.run(tf.global_variables_initializer())
            # The LSTM state is carried over from one batch to the next.
            state = sess.run(self.initial_state)
            step = 0
            for step, (x, y) in enumerate(batch_generator, 1):
                tic = time.time()
                batch_loss, state, _ = sess.run(
                    [self.loss, self.final_state, self.optimizer],
                    feed_dict={self.inputs: x,
                               self.targets: y,
                               self.keep_prob: self.train_keep_prob,
                               self.initial_state: state})
                toc = time.time()

                # Throttle the console output.
                if step % log_every_n == 0:
                    print('step: {}/{}... '.format(step, max_steps),
                          'loss: {:.4f}... '.format(batch_loss),
                          '{:.4f} sec/batch'.format((toc - tic)))
                if step % save_every_n == 0:
                    self.saver.save(sess, os.path.join(save_path, 'model'), global_step=step)
                if step >= max_steps:
                    break
            # Always leave a checkpoint for the last completed step.
            self.saver.save(sess, os.path.join(save_path, 'model'), global_step=step)

sample()方法是用一段起始字符（prime）逐个字符地生成一段文本，直接看注释吧。

    def sample(self, n_samples, prime, vocab_size):
        """Generate character ids, starting from the ids in ``prime``.

        Args:
            n_samples: Number of generation steps run after the first
                sampled character.
            prime: Sequence of character ids used to warm up the state.
            vocab_size: Number of character ids the model predicts.

        Returns:
            NumPy array holding the ids of ``prime``, then one id sampled
            right after it, then ``n_samples`` further sampled ids.
        """
        generated = list(prime)
        sess = self.session
        state = sess.run(self.initial_state)
        probs = np.ones((vocab_size, ))  # fallback distribution when prime is empty

        def advance(char_id, state):
            # Feed one character; return [next-char probabilities, new state].
            cell_input = np.zeros((1, 1))
            cell_input[0, 0] = char_id
            return sess.run([self.proba_prediction, self.final_state],
                            feed_dict={self.inputs: cell_input,
                                       self.keep_prob: 1.,
                                       self.initial_state: state})

        # Warm up on the prime text; only the last prediction is used below.
        for char_id in prime:
            probs, state = advance(char_id, state)
            print(probs)  # probabilities predicted after this prime character

        # First generated character.
        char_id = pick_top_n(probs, vocab_size)
        generated.append(char_id)

        # Keep feeding the last sampled character back into the network.
        for _ in range(n_samples):
            probs, state = advance(char_id, state)
            char_id = pick_top_n(probs, vocab_size)
            generated.append(char_id)

        return np.array(generated)


    def load(self, checkpoint):
        """Open a new session and restore the variables from ``checkpoint``."""
        self.session = session = tf.Session()
        self.saver.restore(session, checkpoint)
        print('Restored from: {}'.format(checkpoint))

sample.py

该模块利用模型生成文本，主要调用的是model.py中的sample()函数。这也没什么讲的。

import tensorflow as tf
from read_utils import TextConverter
from model import CharRNN
import os
from IPython import embed

FLAGS = tf.flags.FLAGS  # container through which the parsed command-line flags are read

# Architecture flags. NOTE(review): these presumably have to match the values
# used when the checkpoint was trained - confirm.
tf.flags.DEFINE_integer('lstm_size', 128, 'size of hidden state of lstm')
tf.flags.DEFINE_integer('num_layers', 2, 'number of lstm layers')
tf.flags.DEFINE_boolean('use_embedding', False, 'whether to use embedding')
tf.flags.DEFINE_integer('embedding_size', 128, 'size of embedding')
# Where to find the pickled vocabulary and the checkpoint to restore.
tf.flags.DEFINE_string('converter_path', '', 'model/name/converter.pkl')
tf.flags.DEFINE_string('checkpoint_path', '', 'checkpoint path')
# Generation settings.
tf.flags.DEFINE_string('start_string', '', 'use this string to start generating')
tf.flags.DEFINE_integer('max_length', 30, 'max length to generate')


def main(_):
    """Generate text from a trained checkpoint and print it.

    Loads the pickled vocabulary, rebuilds the network in sampling mode,
    restores the checkpoint, then samples ``max_length`` characters after
    the optional start string.  The single positional argument is ignored.
    """
    # The vocabulary is keyed by text characters.  Encoding the start string
    # to bytes makes Python 3 iterate over integers, so every character would
    # be looked up as unknown; only a byte string needs decoding to text.
    if isinstance(FLAGS.start_string, bytes):
        FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    # A directory means "use the most recent checkpoint inside it".
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path =\
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size, num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model.load(FLAGS.checkpoint_path)

    # Encode the start string, sample ids, and decode them back to text.
    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))


# Script entry point: hand control to tf.app.run(), which is expected to parse
# the flags defined in this file and then call main().
if __name__ == '__main__':
    tf.app.run()

利用模型生成文本

训练：

# Train a model named "shakespeare" on data/shakespeare.txt; flags that are
# not listed here keep the defaults declared in train.py.
python train.py \
  --input_file data/shakespeare.txt  \
  --name shakespeare \
  --num_steps 50 \
  --num_seqs 32 \
  --learning_rate 0.01 \
  --max_steps 20000

生成文本：

# Generate 1000 characters from the latest checkpoint in model/shakespeare/.
# The architecture flags are left at their defaults, as in the training command.
python sample.py \
  --converter_path model/shakespeare/converter.pkl \
  --checkpoint_path model/shakespeare/ \
  --max_length 1000

以下为生成的结果：

That maked him seal any to me of thy born, thou have
That hath shall be see to an arme of this heant.
Who say, the shurse as that the wing a panitions,
The matter to the fortunes we that answers;
Is his through how to the sure of the sate asson,
All men what there that, how with myself,
What that he shall so the tough the sure what he hustay.

PRANDIO:
How have thought that treasants: who shall boy,
Thou art the some may well, that as the world a through,
And, whose strunce thousand him of thee of my stard,
When his to me of him,--to my much to the
stand, he that and his mest more than the sanders the tomb:
Why, that the mad and may he hath stit her border hour
And marry a servant house to the mind the world
Where are this altience would say him borne.

SEFARNES:
I will there hath more to had so hard thinks
As stinger and, sir, there's a made in his say,
What seem at him, with the word of the finger horns and
they have sent him and takes an otise,
Is my both the stir of as the frou

可以看到生成的句子还不通顺，很多都不能称为一个句子，这还有很多可以改进的地方。

项目完整代码：char-rnn