Keras (27): Text Generation with an RNN


Data source: the Shakespeare dataset

I. Data Processing

1. Load the training data
# https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras

input_filepath = "./shakespeare.txt"
text = open(input_filepath, 'r').read()
print(len(text))
print(text[0:100])
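If shakespeare.txt is not already on disk, it can be fetched with keras.utils.get_file, which caches the download and returns the local path (a minimal sketch):

# download the corpus to the Keras cache (~/.keras/datasets/) if missing
input_filepath = keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')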
2. Build the vocabulary
"""
# 1. generate vocab
# 2. build mapping char->id
# 3. data -> id_data
# 4. abcd -> bcd<eos>
"""
vocab = sorted(set(text))
print(len(vocab))
print(vocab)
3. Build the char-to-id mapping
char2idx = {char:idx for idx, char in enumerate(vocab)}
print(char2idx)
4. Build the id-to-char mapping
idx2char = np.array(vocab)
print(idx2char)
5. Convert the text to ids
text_as_int = np.array([char2idx[c] for c in text])
print(text_as_int[0:10])
print(text[0:10])
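As a quick sanity check, decoding the first few ids back through idx2char should reproduce the opening characters of the text:

# round-trip: ids -> chars should match text[0:10]
print(''.join(idx2char[text_as_int[0:10]]))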
6. Load the data into a Dataset and split it into sequences
def split_input_target(id_text):
    """
    abcde -> abcd, bcde
    """
    return id_text[0:-1], id_text[1:]

# load the id array into a tf.data.Dataset
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# batch the character stream into sequences of 101 ids (100 inputs + 1 shifted target)
seq_length = 100
seq_dataset = char_dataset.batch(seq_length + 1,
                                 drop_remainder = True)
# inspect a few elements
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(''.join(idx2char[seq_id.numpy()])))
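Because drop_remainder = True, any characters that do not fill a complete 101-id sequence are discarded. Assuming the standard shakespeare.txt of 1,115,394 characters, the yield works out as below (a sketch; the exact numbers depend on your copy of the file):

# 1,115,394 ids // 101 ids per sequence = 11,043 sequences
print(len(text) // (seq_length + 1))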
7. Split each sequence into an input and a target
seq_dataset = seq_dataset.map(split_input_target)

for item_input, item_output in seq_dataset.take(1):
    print(item_input.numpy())
    print(item_output.numpy())
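The one-character shift is easier to verify by decoding the id arrays back to text:

# decode the pair so the shift abcde -> abcd, bcde is visible
for item_input, item_output in seq_dataset.take(1):
    print(repr(''.join(idx2char[item_input.numpy()])))
    print(repr(''.join(idx2char[item_output.numpy()])))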
8. Shuffle and batch the data (batch_size = 64)
batch_size = 64
buffer_size = 10000

seq_dataset = seq_dataset.shuffle(buffer_size).batch(
    batch_size, drop_remainder=True)

II. Building the Model

1. Define model constants
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024
2. Define the model
# # 1) Defining the model with Sequential #######################################
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#     model = keras.models.Sequential([
#         keras.layers.Embedding(vocab_size, embedding_dim,
#                                 batch_input_shape = [batch_size, None]),
#         keras.layers.SimpleRNN(units = rnn_units,
#                                 # stateful=True carries the final state of each batch
#                                 # over as the initial state of the next batch. This is
#                                 # useful for long texts: e.g. sample A supplies
#                                 # characters 0-200 in batch 1 and 200-400 in batch 2.
#                                 # It helps when the corpus has many long passages, and
#                                 # it requires that batch order is not shuffled.
#                                 stateful = True,
#                                 # new_s = f(w1*old_s + w2 * x)
#                                 # recurrent_initializer initializes w1 (recurrent weights);
#                                 # kernel_initializer would initialize w2 (input weights).
#                                 # 'glorot_uniform' is the Glorot/Xavier uniform initializer.
#                                 recurrent_initializer = 'glorot_uniform',
#                                 return_sequences = True),
#         keras.layers.Dense(vocab_size),
#     ])
#     return model

# model = build_model(
#     vocab_size = vocab_size,
#     embedding_dim = embedding_dim,
#     rnn_units = rnn_units,
#     batch_size = batch_size)

# 2) Defining the model with the functional API ################################
inputs = keras.Input(batch_shape=(batch_size, None))
print(inputs.shape)
outputs = keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
print(outputs.shape)
outputs = keras.layers.SimpleRNN(units = rnn_units,
                                 stateful = True,
                                 recurrent_initializer = 'glorot_uniform',
                                 return_sequences = True)(outputs)
print(outputs.shape)
outputs = keras.layers.Dense(vocab_size)(outputs)
print(outputs.shape)

model = keras.Model(inputs, outputs)

###############################################################################
model.summary()
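The parameter counts reported by model.summary() can be checked by hand. With this corpus the vocabulary has 65 characters, so (assuming vocab_size = 65):

# Embedding : 65 * 256                        =    16,640
# SimpleRNN : 1024*1024 + 256*1024 + 1024     = 1,311,744  (recurrent + input + bias)
# Dense     : 1024 * 65 + 65                  =    66,625
# Total                                       = 1,395,009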
3. Test the model on a single batch
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)
    print(example_batch_predictions)

# random sampling.
# greedy, random.
# sample one character per position from the logits of a single example
sample_indices = tf.random.categorical(
    logits = example_batch_predictions[0], num_samples = 1)
print(sample_indices)
# squeeze: (100, 1) -> (100,)
sample_indices = tf.squeeze(sample_indices, axis = -1)
print(sample_indices)
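The comments above contrast greedy decoding with random sampling; the tutorial samples randomly via tf.random.categorical. A greedy variant would simply take the argmax over the vocabulary axis instead (a sketch, not used in the rest of the code):

# greedy decoding: pick the highest-logit character at each position
greedy_indices = tf.argmax(example_batch_predictions[0], axis = -1)
print(greedy_indices)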

# print the input, the target, and the sampled predictions
print("Input: ", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Output: ", repr("".join(idx2char[target_example_batch[0]])))
print()
print("Predictions: ", repr("".join(idx2char[sample_indices])))

III. Loss Function and Optimizer

def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)

# compile with the Adam optimizer and the custom loss
model.compile(optimizer = 'adam', loss = loss)

# compute the loss on the example batch
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())
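Before training, the model's output distribution is roughly uniform over the vocabulary, so this loss should sit near ln(vocab_size):

# expected loss of an untrained model: -ln(1/65) = ln(65) ≈ 4.17
print(np.log(vocab_size))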

IV. Callbacks: Checkpoints

output_dir = "./text_generation_checkpoints"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

V. Training the Model

epochs = 100
history = model.fit(seq_dataset, epochs = epochs,
                    callbacks = [checkpoint_callback])
# tf.train.latest_checkpoint finds the most recently saved checkpoint file
new_checkpoint = tf.train.latest_checkpoint(output_dir)  

VI. Defining the Inference Model

# 1) Inference model via the functional API ####################################
inputs = keras.Input(batch_shape=(1, None))
print(inputs.shape)
outputs = keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
print(outputs.shape)
outputs = keras.layers.SimpleRNN(units = rnn_units,
                                 stateful = True,
                                 recurrent_initializer = 'glorot_uniform',
                                 return_sequences = True)(outputs)
print(outputs.shape)
outputs = keras.layers.Dense(vocab_size)(outputs)
print(outputs.shape)

model2 = keras.Model(inputs, outputs)
 
# # 2) Inference model via Sequential ###########################################
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#     model = keras.models.Sequential([
#         keras.layers.Embedding(vocab_size, embedding_dim,
#                                 batch_input_shape = [batch_size, None]),
#         keras.layers.SimpleRNN(units = rnn_units,
#                                 # same stateful / initializer notes as the
#                                 # training model above
#                                 stateful = True,
#                                 recurrent_initializer = 'glorot_uniform',
#                                 return_sequences = True),
#         keras.layers.Dense(vocab_size),
#     ])
#     return model

# model2 = build_model(vocab_size,
#                       embedding_dim,
#                       rnn_units,
#                       batch_size = 1)
# model2.build(tf.TensorShape([1, None]))
# ##############################################################################

# Autoregressive generation loop:
# start char sequence A
# A -> model -> b
# A.append(b) -> B
# B(Ab) -> model -> c
# B.append(c) -> C
# C(Abc) -> model -> ...
model2.load_weights(tf.train.latest_checkpoint(output_dir))
model2.summary()

VII. Generating Text with the Inference Model

def generate_text(model, start_string, num_generate = 1000):
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    
    text_generated = []
    model.reset_states()
    
    for _ in range(num_generate):
        # 1. model inference -> predictions
        # 2. sample -> ch -> text_generated.
        # 3. update input_eval
        
        # predictions : [batch_size, input_eval_len, vocab_size]
        predictions = model(input_eval)
        # predictions : [input_eval_len, vocab_size]
        predictions = tf.squeeze(predictions, 0)
        # predicted_ids: [input_eval_len, 1]
        # a b c -> b c d
        predicted_id = tf.random.categorical(
            predictions, num_samples = 1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # s, x -> rnn -> s', y
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

new_text = generate_text(model2, "All: ")
print(new_text)
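A common refinement of the sampling step, not part of the original code, is temperature scaling: dividing the logits by a temperature below 1.0 makes sampling more conservative, above 1.0 more adventurous. A minimal sketch of how the sampling line inside generate_text would change:

# hypothetical variant: scale logits by a temperature before sampling
temperature = 0.5
predicted_id = tf.random.categorical(
    predictions / temperature, num_samples = 1)[-1, 0].numpy()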