"""利用莎士比亚数据集进行RNN文本生成的训练

Train an RNN for character-level text generation on the Shakespeare dataset.
"""
import tensorflow as tf
import numpy as np
from tensorflow import keras
import pandas as pd
import sklearn
import sys
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler

# Report the runtime environment: TF version, interpreter, and library versions.
print(tf.__version__)
print(sys.version_info)
for mod in (mpl, np, pd, sklearn, tf, keras):
    print(f"{mod.__name__} {mod.__version__}")

# Shakespeare dataset:
# https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"
# Fix: use a context manager so the file handle is closed deterministically,
# and pin the encoding instead of relying on the platform default.
with open(input_filepath, 'r', encoding='utf-8') as f:
    text = f.read()
print(len(text))
print(text[0:100])

# 1. Build the vocabulary (sorted unique characters of the corpus).
# 2. Build the char -> id and id -> char mappings.
# 3. Encode the whole corpus as an integer-id array.
# 4. Training pairs look like "abcd" -> "bcd<eos>": predict the next character.
vocab = sorted(set(text))
print(len(vocab))
print(vocab)

char2idx = {ch: i for i, ch in enumerate(vocab)}
print(char2idx)
idx2char = np.array(vocab)
print(idx2char)

# Encode every character of the corpus as its integer id.
text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[0:10])
print(text[0:10])


# Build the (input, target) training pair from one id sequence.
def split_input_target(id_text):
    """Split a sequence into (input, target): "abcde" -> ("abcd", "bcde")."""
    return id_text[:-1], id_text[1:]


# Convert the encoded corpus (text_as_int) into a tf.data pipeline.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)  # drop the trailing partial batch so every sequence has exactly 101 chars
# Peek: the first two character ids and the characters they map to.
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])
# Peek: the first two 101-char sequences, as id tensors and as (space-joined) text.
for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(' '.join(idx2char[seq_id.numpy()])))

# Turn each 101-char sequence into an (input, target) pair.
seq_dataset = seq_dataset.map(split_input_target)
# NOTE(review): unlike the .take(2) peeks above, this loop prints EVERY
# pair in the dataset — confirm that is intended (it is very verbose).
for item_input,item_output in seq_dataset:
    print(item_input.numpy())
    print(item_output.numpy())

# Shuffle within a 10k-element buffer, then batch for training.
batch_size = 64
buffer_size = 10000
seq_dataset = seq_dataset.shuffle(buffer_size).batch(
    batch_size,drop_remainder=True)

# Model hyperparameters.
vocab_size = len(vocab)  # number of distinct characters in the corpus
embedding_dim = 256      # size of each character-embedding vector
rnn_units = 1024         # hidden units in the recurrent layer
# Model factory: Embedding -> SimpleRNN -> Dense (raw logits over the vocab).
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build a char-level RNN language model for the given batch size.

    The final Dense layer has no activation: it outputs logits,
    matched by from_logits=True in the loss.
    """
    # NOTE(review): the recurrent layer is not stateful=True; the generation
    # loop below calls reset_states() as if it were — confirm intent.
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(vocab_size, embedding_dim,
                                     batch_input_shape=[batch_size, None]))
    model.add(keras.layers.SimpleRNN(units=rnn_units,
                                     return_sequences=True))
    model.add(keras.layers.Dense(vocab_size))
    return model

# Instantiate the training model with the training batch size (64).
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size)

model.summary()


# Run one batch through the (untrained) model as a shape sanity check.
# NOTE: the loop variables (input_example_batch, target_example_batch,
# example_batch_predictions) deliberately leak into module scope and are
# reused by the sampling and loss demos below.
for input_example_batch,target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)


# Random-sampling demo on the untrained model's predictions.
# The Dense outputs (the values BEFORE any softmax) are the logits.
sample_indices = tf.random.categorical(logits=example_batch_predictions[0],
                     num_samples = 1)
print(sample_indices)
# Squeeze the (100, 1) sample tensor down to shape (100,).
sample_indices = tf.squeeze(sample_indices,axis=-1)
print(sample_indices)

# Training loss: sparse categorical cross-entropy on raw logits
# (the Dense output layer applies no softmax).
def loss(labels, logits):
    """Return the per-timestep cross-entropy between labels and logits."""
    return keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)

model.compile(optimizer = 'adam', loss = loss)
# Sanity-check the loss on one untrained batch.
# Bug fix: the loss must compare predictions against the TARGET batch
# (the next characters), not against the input batch.
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

# Save model weights after each training epoch.
output_dir = "./text_generation_checkpoints"
# makedirs(..., exist_ok=True) also creates missing parents and avoids the
# exists/mkdir race of the check-then-create pattern.
os.makedirs(output_dir, exist_ok=True)

# Bug fix: ModelCheckpoint formats `filepath` with the key `epoch`
# (plus metric names); the old template 'ckpt_{epochs}' raised a
# KeyError the first time a checkpoint was written.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True, )

# Train; the callback saves weights after every epoch.
# NOTE: 100 epochs of a SimpleRNN over the full corpus is slow — lower
# this for a smoke test.
epochs = 100
history = model.fit(seq_dataset, epochs=epochs,
                    callbacks=[checkpoint_callback])



# Rebuild the model with batch_size=1 for inference, then load the latest
# trained weights from the checkpoint directory.
model2 = build_model(vocab_size,embedding_dim,
                    rnn_units,
                    batch_size=1)
model2.load_weights(tf.train.latest_checkpoint(output_dir))
# Fix the input spec to a single sample of arbitrary length: [1, None].
model2.build(tf.TensorShape([1,None]))


# Text-generation loop:
# start with seed A; A -> model -> b; append -> Ab -> model -> c; repeat.
def generate_text(model, start_string, num_generate=1000):
    """Generate `num_generate` characters from `model`, seeded with `start_string`.

    Returns the seed concatenated with the generated characters.
    """
    # Encode the seed and add a batch dimension: the model expects [1, None].
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    model.reset_states()

    for _ in range(num_generate):
        # 1. model inference -> predictions [batch_size, seq_len, vocab_size]
        predictions = model(input_eval)
        # 2. drop the batch dim -> [seq_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # 3. sample the next char id from the logits of the LAST position
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # 4. feed the sampled char back in as the next input
        input_eval = tf.expand_dims([predicted_id], 0)
    # Bug fixes vs the original:
    #  - the `return` was INSIDE the loop, so only one character was generated;
    #  - characters were joined with ' ', inserting spurious spaces —
    #    char-level output must use ''.join.
    return start_string + ''.join(text_generated)


# Bug fix: the inference model built above is named `model2`;
# `models` was an undefined name (NameError).
new_text = generate_text(model2, "All: ")
print(new_text)
# (Removed: CSDN web-page footer text accidentally captured when the script
# was copied from the blog post — it was not valid Python.)