Implement character-level text generation with a recurrent neural network (RNN), trained on a Shakespeare text corpus.
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
# Read the source file
input_filepath = "./shakespeare.txt"
text = open(input_filepath,'r').read()
print(len(text))
print(text[0:100])
# Data preprocessing:
# 1. generate vocab
# 2. build mapping: char -> id
# 3. data -> id_data
# 4. input (abcd) -> output (bcd<eof>): train a model that predicts the next character
vocab = sorted(set(text))  # set removes duplicates, sorted gives a stable ordering
print(len(vocab))
print(vocab)
# Mapping from char to index
char2idx = {char:idx for idx,char in enumerate(vocab)}
print(char2idx)
# Mapping from index to char
idx2char = np.array(vocab)
print(idx2char)
# Convert the full text into an index array
text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[0:10])
print(text[0:10])
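# Round-trip sanity check (illustration only, not in the original code):
# decoding the first ten indices should reproduce the first ten characters.
print(''.join(idx2char[text_as_int[0:10]]) == text[0:10])  # True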
def split_input_target(id_text):
    '''abcde -> abcd, bcde'''
    return id_text[0:-1], id_text[1:]  # the target is the input shifted left by one character
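# A minimal sanity check of split_input_target (illustration only):
# [0, 1, 2, 3, 4] should split into input [0, 1, 2, 3] and target [1, 2, 3, 4].
demo_input, demo_target = split_input_target(np.array([0, 1, 2, 3, 4]))
print(demo_input)   # [0 1 2 3]
print(demo_target)  # [1 2 3 4]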
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)  # wrap the index array in a tf.data.Dataset
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True)  # drop_remainder=True: discard the last group if it is shorter than seq_length + 1
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])
for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(''.join(idx2char[seq_id.numpy()])))  # repr makes whitespace and newlines visible
seq_dataset = seq_dataset.map(split_input_target)  # apply split_input_target to every 101-char sequence
for item_input, item_output in seq_dataset.take(2):
    print(item_input.numpy())
    print(item_output.numpy())
batch_size = 64
buffer_size = 10000
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size,drop_remainder=True)
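# Quick shape check (illustration only): after shuffling and batching, every
# element should be a pair of (batch_size, seq_length) tensors, i.e. (64, 100).
for batch_input, batch_target in seq_dataset.take(1):
    print(batch_input.shape, batch_target.shape)  # (64, 100) (64, 100)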
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024
# Model-building function
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim,
                               batch_input_shape=[batch_size, None]),
        # stateful=True keeps the hidden state across calls; the char-by-char
        # generation loop below feeds back only the last predicted character
        # and relies on this carried state
        keras.layers.SimpleRNN(units=rnn_units,
                               return_sequences=True,
                               stateful=True),
        keras.layers.Dense(vocab_size)
    ])
    return model
model = build_model(vocab_size=vocab_size, embedding_dim=embedding_dim,
                    rnn_units=rnn_units, batch_size=batch_size)
model.summary()
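# Rough parameter accounting behind model.summary() (assuming vocab_size == 65,
# which this Shakespeare file typically yields):
#   Embedding: 65 * 256                     =    16,640
#   SimpleRNN: 256*1024 + 1024*1024 + 1024  = 1,311,744  (input, recurrent, bias)
#   Dense:     1024 * 65 + 65               =    66,625
print(65 * 256, 256 * 1024 + 1024 * 1024 + 1024, 1024 * 65 + 65)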
# Inspect the model output on one batch
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)  # (batch_size, seq_length, vocab_size)
# 1. random sampling: each position outputs a probability distribution over the 65 characters
# 2. greedy strategy (always take the most probable character) vs. random strategy
#    (sample according to the probabilities); random sampling is used here,
#    with a greedy sketch below for comparison
sample_indices = tf.random.categorical(logits=example_batch_predictions[0], num_samples=1)  # (100, 65) -> (100, 1)
sample_indices = tf.squeeze(sample_indices, axis=-1)  # (100, 1) -> (100,)
print(sample_indices)
print("Input:",repr(''.join(idx2char[input_example_batch[0]])))
print()
print("Output",repr(''.join(idx2char[target_example_batch[0]])))
print()
print("Predictions",repr(''.join(idx2char[sample_indices])))
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
model.compile(optimizer='adam',loss=loss)
example_loss = loss(target_example_batch,example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())
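# Why from_logits=True: the final Dense layer has no softmax, so its outputs are
# raw logits. As a check (illustration only), applying softmax first and letting
# from_logits default to False should give essentially the same mean loss.
probs = tf.nn.softmax(example_batch_predictions)
print(keras.losses.sparse_categorical_crossentropy(target_example_batch, probs).numpy().mean())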
output_dir = "./text_generation_checkpoints"
if not os.path.exists(output_dir):
os.mkdir(output_dir)
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')  # checkpoint filename pattern, one file per epoch
checkpoint_callback = keras.callbacks.ModelCheckpoint(
filepath = checkpoint_prefix,
save_weights_only=True)
epochs = 3
history = model.fit(seq_dataset,epochs=epochs,callbacks=[checkpoint_callback])
tf.train.latest_checkpoint(output_dir)  # path of the most recent checkpoint
# Load the model from the checkpoint
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)  # batch_size = 1: generate one sequence at a time
model2.load_weights(tf.train.latest_checkpoint(output_dir))  # load the trained weights
model2.build(tf.TensorShape([1, None]))  # fix the input shape to batch size 1
model2.summary()
# Text generation procedure (variable length):
# start char sequence A ,
# A --> model -->b
# A.append(b) -> B
# B --> model -->c
# B.append(c) -> C
# ...
def generate_text(model, start_string, num_generate=1000):
    input_eval = [char2idx[ch] for ch in start_string]  # map the seed string to indices
    input_eval = tf.expand_dims(input_eval, 0)  # add a batch dimension
    text_generated = []
    model.reset_states()
    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> char -> text_generated
        # 3. update input_eval
        # predictions: [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # predictions: [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # tf.random.categorical -> [input_eval_len, 1];
        # input (a b c) --> prediction (b c d), so only the last position is needed
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # s, x --> rnn --> s, y: the stateful RNN carries s, so only the last
        # predicted_id needs to be fed back as the next input
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)
new_text = generate_text(model2,"All:")
print(new_text)
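# A common extension (not in the original code) is temperature sampling: dividing
# the logits by a temperature before sampling trades diversity (high temperature)
# against fidelity (low temperature). A minimal sketch, reusing the names above;
# generate_text_with_temperature is a hypothetical helper added for illustration.
def generate_text_with_temperature(model, start_string, num_generate=1000, temperature=0.5):
    input_eval = tf.expand_dims([char2idx[ch] for ch in start_string], 0)
    text_generated = []
    model.reset_states()
    for _ in range(num_generate):
        predictions = tf.squeeze(model(input_eval), 0) / temperature  # scale the logits
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)
print(generate_text_with_temperature(model2, "All:", temperature=0.5))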