[代码]基于RNN的文本生成算法

“什么时候能自动生成博客?”

前言

跳过废话,直接看正文

与传统的神经网络相比,RNN更擅长把握上下文之间的关系,因此被大量用于自然语言处理的相关任务,例如生成与训练文集相似的文字、序列标注、中文分词等。

此文列出两种基于RNN的文本生成算法,以供参考。


正文

基于字符的文本生成算法

此代码为keras的官方例子

'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function

import random
import sys
import time

import numpy as np
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

# Wall-clock reference; the training loop reports elapsed time against it.
start_time = time.time()
# From here on everything printed by the script goes to out.log.
output_file_handler = open('out.log', 'w')
sys.stdout = output_file_handler

path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Close the corpus file as soon as it has been read.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# Character vocabulary and the two lookup tables (char <-> integer index).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

# One-hot encode each window (X) and the character that follows it (y).
# The builtin `bool` is used as dtype because the `np.bool` alias is not
# available in every NumPy release.
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: a single LSTM followed by a softmax over the characters
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature reweighting.

    The probabilities are moved to log space, divided by `temperature`,
    exponentiated and renormalised; a single multinomial draw is then made
    from the result and the index of the drawn class is returned.
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)
    # multinomial(1, p, 1) yields a one-hot row; argmax recovers the index.
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

# train the model, output generated text after each iteration
for iteration in range(1, 60):
    # Report the wall-clock time elapsed since the script started.
    # print() is called as a function: the print-statement form is a syntax
    # error once print_function is imported from __future__.
    end_time = time.time()
    print('training used time : ' + str(end_time - start_time))

    print()
    print('-' * 50)
    print('Iteration', iteration)
    # NOTE(review): `nb_epoch` is the keyword this script was written
    # against; confirm it is still accepted by the installed Keras.
    model.fit(X, y, batch_size=128, nb_epoch=1)

    # Pick a random window of the corpus to use as the generation seed.
    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            # One-hot encode the current maxlen-character window.
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window forward by one character.
            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

结合word2vec的文本生成算法

此代码还未完成,将来我再抽空将它完成,这里只是给一个思路。
更多代码参考github

'''Example script to generate text using keras and word2vec

At least 20 epochs are required before the generated text
starts sounding coherent.

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

'''

from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from nltk import tokenize
import numpy as np
import random
import sys
import os
import nltk

import gensim, logging
import os
# Configure root logging at INFO level with a timestamped message format.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# a memory-friendly iterator
class MySentences(object):
    """Stream tokenized lines from every file in a directory.

    Iterating yields one list of words per line, reading the files lazily
    so the corpus never has to be held in memory. A fresh pass over the
    directory starts each time the object is iterated.

    Args:
        dirname: directory whose entries are all opened and read as text.
        min_word_count_in_sentence: lines with fewer words than this are
            skipped (the default of 1 drops blank lines).
    """

    def __init__(self, dirname, min_word_count_in_sentence=1):
        self.dirname = dirname
        self.min_word_count_in_sentence = min_word_count_in_sentence

    def process_line(self, line):
        """Split a line into words on whitespace."""
        return line.split()

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # The context manager closes each file once it is exhausted
            # (or when the generator is closed early).
            with open(os.path.join(self.dirname, fname)) as input_file:
                for line in input_file:
                    processed_line = self.process_line(line)
                    if len(processed_line) >= self.min_word_count_in_sentence:
                        yield processed_line

def generate_word2vec_train_files(input_dir, output_dir, sentence_start_token, sentence_end_token, unkown_token, word_min_count, word2vec_size):
    """Rewrite the raw corpus into one-sentence-per-line word2vec training files.

    A throwaway Word2Vec model is used only to build a vocabulary over
    `input_dir`. Every file in `input_dir` is then copied to a same-named
    file in `output_dir` with each line split into sentences, every word
    missing from that vocabulary replaced by `unkown_token`, and each
    sentence wrapped in `sentence_start_token` / `sentence_end_token`.

    Args:
        input_dir: directory holding the raw text files.
        output_dir: directory to write to; created if it does not exist.
        sentence_start_token: marker prepended to every sentence.
        sentence_end_token: marker appended to every sentence.
        unkown_token: replacement for out-of-vocabulary words.
        word_min_count: passed to Word2Vec as `min_count`.
        word2vec_size: passed to Word2Vec as `size`.
    """
    print('generate_word2vec_train_files...')
    tmp_word2vec_model = gensim.models.Word2Vec(min_count = word_min_count, size = word2vec_size)
    original_sentences = MySentences(input_dir)
    tmp_word2vec_model.build_vocab(original_sentences)
    # NOTE(review): `size=` and `model.vocab` are the gensim names this code
    # was written against; confirm they exist in the installed version.
    original_word2vec_vocab = tmp_word2vec_model.vocab

    make_dir_if_not_exist(output_dir)
    for fname in os.listdir(input_dir):
        line_count = 0
        # Both handles are closed even if tokenization raises part-way.
        with open(os.path.join(output_dir, fname), 'w') as output_file, \
                open(os.path.join(input_dir, fname)) as input_file:
            for line in input_file:
                # Trim decoration characters and whitespace from both ends.
                line = line.strip(' -=:\"\'_*\n')
                if len(line) == 0:
                    continue
                sentences = tokenize.sent_tokenize(line)
                for idx, sentence in enumerate(sentences):
                    words = [
                        word if word in original_word2vec_vocab else unkown_token
                        for word in sentence.split()
                    ]
                    sentences[idx] = sentence_start_token + ' ' + " ".join(words) + ' ' + sentence_end_token + '\n'
                line_count += len(sentences)
                output_file.writelines(sentences)
        print("line_count", line_count)

def train_word2vec_model(dataset_dir, save_model_file, word_min_count, word2vec_size):
    """Train a Word2Vec model on the files in `dataset_dir` and save it.

    The vocabulary is built in one pass over the directory, the model is
    trained in a second pass, then written to `save_model_file`.

    Returns:
        The trained Word2Vec model.
    """
    print('train_word2vec_model...')
    model = gensim.models.Word2Vec(min_count=word_min_count, size=word2vec_size)
    # The vocabulary pass and the training pass each get their own iterator.
    model.build_vocab(MySentences(dataset_dir))
    model.train(MySentences(dataset_dir))
    model.save(save_model_file)
    return model

def load_existing_word2vec_model(model_file_path):
    """Load a saved Word2Vec model, or return None when the file is absent."""
    if not os.path.exists(model_file_path):
        return None
    print("load existing model...")
    return gensim.models.Word2Vec.load(model_file_path)

def generate_rnn_train_files(input_dir, output_dir, fixed_sentence_len, unkown_token, sentence_start_token, sentence_end_token):
    """Write fixed-length copies of the word2vec training sentences.

    Each line of each file in `input_dir` is treated as one sentence.
    Sentences with more than `fixed_sentence_len` words are dropped, and
    shorter ones are right-padded with `sentence_end_token` until they have
    exactly `fixed_sentence_len` words. Results go to same-named files in
    `output_dir`, and a summary of dropped sentences is printed at the end.

    Args:
        input_dir: directory of one-sentence-per-line files.
        output_dir: directory to write to; created if it does not exist.
        fixed_sentence_len: number of words every output sentence has.
        unkown_token: unused here; kept for signature compatibility.
        sentence_start_token: unused here; kept for signature compatibility.
        sentence_end_token: word used for padding.
    """
    print('generate_rnn_train_files...')
    make_dir_if_not_exist(output_dir)

    long_than_fixed_len_count = 0
    total_sentence_count = 0
    for fname in os.listdir(input_dir):
        # Both handles are closed even if processing raises part-way.
        with open(os.path.join(output_dir, fname), 'w') as output_file, \
                open(os.path.join(input_dir, fname)) as input_file:
            for sentence in input_file:
                sentence = sentence.strip('\n')
                total_sentence_count += 1
                len_of_sentence = len(sentence.split())
                if len_of_sentence > fixed_sentence_len:
                    long_than_fixed_len_count += 1
                    continue
                # Repeating zero times leaves exact-length sentences untouched.
                padding = (' ' + sentence_end_token) * (fixed_sentence_len - len_of_sentence)
                output_file.write(sentence + padding + '\n')
    print ("sentence longer than fixed_len : %d / %d" %(long_than_fixed_len_count, total_sentence_count))

def train_rnn_model(dataset_dir, fixed_sentence_len, word2vec_size, word2vec_model):
    """Build the LSTM model and assemble its input matrices (training is a stub).

    The network is compiled and one `(fixed_sentence_len, word2vec_size)`
    matrix of word vectors is collected per sentence in `dataset_dir`.
    Target construction and the actual `fit` call are not implemented yet,
    so the returned model is untrained.

    Args:
        dataset_dir: directory of fixed-length sentence files.
        fixed_sentence_len: number of words per sentence (LSTM time steps).
        word2vec_size: dimensionality of each word vector.
        word2vec_model: object indexable by word, returning that word's vector.

    Returns:
        The compiled, untrained Keras model.
    """
    # build the model: a single LSTM
    print('Build RNN model...')
    rnn_model = Sequential()
    rnn_model.add(LSTM(128, input_shape=(fixed_sentence_len, word2vec_size)))
    rnn_model.add(Dense(word2vec_size))
    rnn_model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.01)
    rnn_model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    print('Generating RNN train data...')
    X = []
    # TODO: build the training targets; nothing is appended to y yet.
    y = []
    sentences = MySentences(dataset_dir)
    for sentence in sentences:
        # assumes no sentence has more than fixed_sentence_len words, as
        # produced by generate_rnn_train_files - TODO confirm for other inputs
        tmp_x = np.zeros((fixed_sentence_len, word2vec_size), dtype=np.float32)
        for idx, word in enumerate(sentence):
            tmp_x[idx] = word2vec_model[word]
        # Append the finished matrix once per sentence.
        X.append(tmp_x)
    print(X)
    print(y)
    print('Generate RNN train data end!')

    # TODO: call rnn_model.fit() once X and y are both available.
    print('Build RNN model over!')

    return rnn_model

class Config:
    """Paths, special tokens and hyper-parameters shared by the pipeline."""

    # Saved word2vec model.
    WORD2VEC_MODE_FILE = "./word2vec_model.model"

    # Dataset directories: raw text, word2vec input, RNN input.
    ORIGINAL_TRAIN_DATASET_DIR = "./small_train_text"
    WORD2VEC_TRAIN_DATASET_DIR = "./small_word2vec_train_text"
    RNN_TRAIN_DATASET_DIR = "./small_rnn_train_text"

    # Special tokens inserted into the training text.
    SENTENCE_START_TOKEN = "SENTENCE_START_TOKEN"
    SENTENCE_END_TOKEN = "SENTENCE_END_TOKEN"
    UNKNOWN_TOKEN = "UNKNOWN_TOKEN"

    # Sentence length in words, word2vec min_count, word-vector size.
    FIXED_SENTENCE_LEN = 30
    MIN_COUNT = 2
    WORD2VEC_SIZE = 20

def make_dir_if_not_exist(dirpath):
    """Create `dirpath`, including missing parents, unless it already exists.

    Raises:
        OSError: if the directory cannot be created and does not exist
            afterwards.
    """
    if os.path.exists(dirpath):
        return
    try:
        os.makedirs(dirpath)
    except OSError:
        # Another process may have created it between the check and the
        # call; only propagate the error when the directory is still missing.
        if not os.path.isdir(dirpath):
            raise

def main():
    """Run the pipeline: obtain a word2vec model, then prepare and build the RNN.

    A saved word2vec model is reused when one exists at
    `Config.WORD2VEC_MODE_FILE`; otherwise the word2vec training files are
    generated and a new model is trained. The RNN training files are
    regenerated on every run before the RNN model is built.
    """
    # word2vec train
    word2vec_model = load_existing_word2vec_model(Config.WORD2VEC_MODE_FILE)

    # Identity test: `== None` would go through the model's own __eq__.
    if word2vec_model is None:
        generate_word2vec_train_files(
            Config.ORIGINAL_TRAIN_DATASET_DIR, Config.WORD2VEC_TRAIN_DATASET_DIR,
            Config.SENTENCE_START_TOKEN, Config.SENTENCE_END_TOKEN, Config.UNKNOWN_TOKEN, Config.MIN_COUNT, Config.WORD2VEC_SIZE)

        word2vec_model = train_word2vec_model(Config.WORD2VEC_TRAIN_DATASET_DIR, Config.WORD2VEC_MODE_FILE, Config.MIN_COUNT, Config.WORD2VEC_SIZE)

    # rnn train
    generate_rnn_train_files(
        Config.WORD2VEC_TRAIN_DATASET_DIR, Config.RNN_TRAIN_DATASET_DIR,
        Config.FIXED_SENTENCE_LEN, Config.UNKNOWN_TOKEN,
        Config.SENTENCE_START_TOKEN, Config.SENTENCE_END_TOKEN)

    train_rnn_model(Config.RNN_TRAIN_DATASET_DIR, Config.FIXED_SENTENCE_LEN, Config.WORD2VEC_SIZE, word2vec_model)

# Runs the whole pipeline unconditionally, including when this file is
# imported rather than executed as a script.
main()

# Import-safe alternative, currently disabled:
# if __name__ == "__main__":
#     main()
#

后记

就目前而言,利用基于RNN的文本生成算法虽然能够生成通顺的句子,却远远不能用来创作文章。因为RNN本质上还是基于词句在训练集中出现的概率来生成文本,这种暴力模仿的文本生成算法终究不是根本的解决之道,将来融合人工智能领域的其他的一些算法或许能够达到比较好的效果。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值