Training an RNN for Text Generation on the Shakespeare Dataset

import tensorflow as tf
import numpy as np
from tensorflow import keras
import pandas as pd
import sklearn
import sys
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

# Shakespeare dataset: https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
input_filepath = "./shakespeare.txt"
text = open(input_filepath,'r').read()
print(len(text))
print(text[0:100])

# 1. Build the vocabulary
# 2. Map char --> id
# 3. Convert data --> id_data
# 4. abcd --> bcd<eos>: predict the next character
vocab = sorted(set(text))
print(len(vocab))
print(vocab)

char2idx = {char:idx for idx, char in enumerate(vocab)}
print(char2idx)
idx2char = np.array(vocab)
print(idx2char)

# Map every character in text to its id
text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[0:10])
print(text[0:10])


# Define the input/target split function
def split_input_target(id_text):
    """
    abcde -->输入abcd,输出bcde
    """
    return id_text[0:-1], id_text[1:]
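
# The following sanity check is not in the original code; it is a minimal
# illustration of what split_input_target returns for a toy id sequence.
demo_input, demo_target = split_input_target(np.array([0, 1, 2, 3, 4]))
print(demo_input, demo_target)  # expected: [0 1 2 3] [1 2 3 4]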


# Convert text_as_int into a tf.data.Dataset
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)  # drop the final batch if it is shorter than seq_length + 1
# Print the character corresponding to each ch_id
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])
# Print the characters corresponding to each seq_id
for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(''.join(idx2char[seq_id.numpy()])))

seq_dataset = seq_dataset.map(split_input_target)
for item_input, item_output in seq_dataset.take(2):  # only preview a couple of examples
    print(item_input.numpy())
    print(item_output.numpy())

batch_size = 64
buffer_size = 10000
seq_dataset = seq_dataset.shuffle(buffer_size).batch(
    batch_size,drop_remainder=True)
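
# Optional addition (not in the original code): prefetch lets the input pipeline
# prepare the next batch while the model is training on the current one.
seq_dataset = seq_dataset.prefetch(tf.data.experimental.AUTOTUNE)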

# Define the model
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024
# Model-building function
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size,embedding_dim,
                              batch_input_shape = [batch_size,None]),
        keras.layers.SimpleRNN(units = rnn_units,
                              stateful = True,
                              recurrent_initializer = 'glorot_uniform',
                              return_sequences=True),
        keras.layers.Dense(vocab_size),])
    return model
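
# SimpleRNN can have trouble carrying context over long character sequences.
# The variant below is only a sketch of a common alternative (not part of the
# original code): the same architecture with an LSTM layer in place of SimpleRNN.
def build_lstm_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim,
                               batch_input_shape=[batch_size, None]),
        keras.layers.LSTM(units=rnn_units,
                          stateful=True,
                          recurrent_initializer='glorot_uniform',
                          return_sequences=True),
        keras.layers.Dense(vocab_size),
    ])
    return model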

model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size)

model.summary()


for input_example_batch,target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)


# Random sampling
# logits are the values fed into softmax in a classification task (pre-softmax scores)
sample_indices = tf.random.categorical(logits=example_batch_predictions[0],
                     num_samples = 1)
print(sample_indices)
# Reshape (100, 1) to (100,)
sample_indices = tf.squeeze(sample_indices,axis=-1)
print(sample_indices)
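
# A common refinement (not in the original code): divide the logits by a
# "temperature" before sampling. Values below 1 make the samples more
# conservative, values above 1 make them more diverse. Sketch only; the
# temperature value here is an arbitrary assumption.
temperature = 0.5
scaled_logits = example_batch_predictions[0] / temperature
sample_indices_t = tf.random.categorical(logits=scaled_logits, num_samples=1)
sample_indices_t = tf.squeeze(sample_indices_t, axis=-1)
print(repr(''.join(idx2char[sample_indices_t.numpy()])))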

# Define the model's loss function
def loss(labels,logits):
    return keras.losses.sparse_categorical_crossentropy(
            labels,logits,from_logits=True)

model.compile(optimizer = 'adam', loss = loss)
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

# Save model checkpoints during training
output_dir = "./text_generation_checkpoints"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True, )

epochs = 100
history = model.fit(seq_dataset, epochs=epochs,
                    callbacks=[checkpoint_callback])
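
# matplotlib is already imported above; this small sketch (not in the original
# code) plots the per-epoch training loss that model.fit recorded in history.
plt.plot(history.history['loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('training loss')
plt.show()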



# Load the trained model
model2 = build_model(vocab_size,embedding_dim,
                    rnn_units,
                    batch_size=1)
model2.load_weights(tf.train.latest_checkpoint(output_dir))
# 1: a single sample per batch
model2.build(tf.TensorShape([1,None]))


# Text generation flow
# start char sequence A,
# A --> model --> b
# A.append(b) --> B --> model --> c --> B.append(c) --> C (abc) ...
def generate_text(model, start_string, num_generate=1000):
    input_eval = [char2idx[ch] for ch in start_string]
    # Expand dims: the model expects a [1, None] batch, but input_eval is 1-D here
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    model.reset_states()

    for _ in range(num_generate):
        # 1.model inference --> prediction
        # 2.sample --> ch --> text_generated
        # 3.update input_eval

        # predictions : [batch_size,input_eval_len,vocab_size]
        predictions = model(input_eval)
        # Remove the batch dimension: [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # sampled ids: [input_eval_len, 1]; keep only the id predicted for the last position
        predicted_id = tf.random.categorical(
            predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)


new_text = generate_text(model2, "All: ")
print(new_text)