input_texts holds the English sentences, stored as a list:
['Go.', 'Hi.', 'Run!', 'Run!', 'Wow!', 'Fire!', ..., 'Help me, please.']
input_characters holds every character that appears in the English sentences, stored as a set:
{'V', "'", 'e', 'I', ' ', 'a', 'G', 'U', '9', 'g', 'n', 'h', '$', 'y', 'M', 'R', 'c', 'f', 'l', 'o', ':', 'm', 'F', 'w', 'z', '5', 'S', '%', 'k', '&', 'L', 'r', 'A', 'D', 'p', 'C', ',', 'K', 'x', '0', 's', 'J', '!', 'O', 'B', 'E', 't', 'H', '-', 'i', 'N', 'Q', '7', 'v', 'b', 'T', 'd', 'q', '8', 'j', 'P', '1', 'Y', 'u', '3', '6', '?', 'W', '.'}
target_texts holds the French sentences, stored as a list:
['\tVa !\n', '\tSalut !\n', '\tCours\u202f!\n', '\tCourez\u202f!\n', ..., "\tAide-moi, s'il te plait.\n"]
target_characters holds every character that appears in the French sentences, stored as a set:
{'\xa0', 'J', 'z', ')', 'P', 'o', '0', '?', 'l', 'I', 'e', '&', '1', 'É', '.', 'O', 'g', 's', 'b', '$', '3', ',', 'a', 'U', 'h', 'c', 'C', 'ï', '\t', '\u202f', 'é', 'T', 'à', 'E', 'Ê', 'î', 'R', 'À', 'x', 'Ç', 'V', "'", 'K', 'B', 'y', 'œ', 'r', 'i', 'L', 'j', 'ë', '«', '\n', '9', 'ù', '8', 'G', 'H', 'd', '’', 'â', 'A', '%', 'w', 't', '5', 'f', '»', 'è', 'ê', 'p', 'F', 'Q', 'm', 'N', ':', '-', 'û', 'ç', 'n', 'D', 'ô', '\u2009', 'k', 'Y', 'S', 'q', '!', ' ', 'u', 'v', 'M', '('}
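These four containers are produced by the vectorization loop in the training script below. As a quick sanity check (a minimal sketch; it assumes the vectorization code below has already run), you can confirm their sizes:
# Minimal sanity check; assumes the vectorization code below has been run.
print(len(input_texts), len(target_texts))    # 10000 10000
print(len(input_characters))                  # 69 distinct English characters
print(len(target_characters))                 # 93 distinct French characters
print(input_texts[0], repr(target_texts[0]))  # Go. '\tVa !\n'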
In summary: 10,000 sentence pairs, 69 unique input (English) characters, 93 unique target (French) characters, a maximum input length of 16, and a maximum target length of 59.
The dataset can be downloaded from http://www.manythings.org/anki/fra-eng.zip
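If you prefer to fetch it in code, a sketch along these lines should work (it assumes fra.txt sits at the top level of the archive; if the server rejects the scripted request, just download the zip in a browser):
# Hypothetical download helper; not part of the original scripts.
import urllib.request
import zipfile

urllib.request.urlretrieve('http://www.manythings.org/anki/fra-eng.zip', 'fra-eng.zip')
with zipfile.ZipFile('fra-eng.zip') as z:
    z.extract('fra.txt')  # assumption: the archive contains fra.txt at its root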
The training code is shown below. It saves the models in .h5 format; I did not end up using s2s.h5.
Note that two models are built and saved here: encoder_model.h5 and decoder_model.h5.
It is best not to lower the number of training epochs. epochs=100 takes a while on my machine, but the final predictions are quite good; training for fewer epochs may hurt the results.
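If you are worried about losing such a long run, one option (my addition, not in the original code) is Keras's standard ModelCheckpoint callback, which writes intermediate weights to disk:
# Optional safeguard for the long training run; the file name is illustrative.
from keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('s2s_checkpoint.h5', monitor='val_loss', save_best_only=True)
# Pass callbacks=[checkpoint] to the model.fit(...) call below.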
# -*- coding: utf-8 -*-
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

batch_size = 64      # Batch size for training.
epochs = 100         # Number of epochs to train for.
latent_dim = 256     # Latent dimensionality of the encoding space (hidden size).
num_samples = 10000  # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'fra.txt'

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')  # one sentence pair per line
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text = line.split('\t')  # a tab separates English from French
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'  # French output starts with \t and ends with \n
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])
# print('Number of samples:', len(input_texts))                      # 10000
# print('Number of unique input tokens:', num_encoder_tokens)        # 69
# print('Number of unique output tokens:', num_decoder_tokens)       # 93
# print('Max sequence length for inputs:', max_encoder_seq_length)   # 16
# print('Max sequence length for outputs:', max_decoder_seq_length)  # 59

# Build char -> index dictionaries, used to vectorize the characters.
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])   # input chars get indices 0..68
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])  # target chars get indices 0..92

# Allocate the arrays.
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),  # (10000, 16, 69)
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),  # (10000, 59, 93)
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),  # (10000, 59, 93)
    dtype='float32')

# Fill the arrays: one-hot encode every character.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # One-hot encode the encoder input sequence.
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # One-hot encode the decoder input and target sequences.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep.
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data omits the start character and is one
            # timestep ahead of decoder_input_data.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
# print(decoder_target_data.shape)  # (10000, 59, 93)

# Define the encoder input.
# encoder_inputs has shape (None, num_encoder_tokens); None means sequences
# of any length can be processed (num_encoder_tokens = 69).
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# The encoder LSTM; we ask it to return its states.
encoder = LSTM(latent_dim, return_state=True)  # latent_dim = 256
# Run the encoder; we get the encoder outputs (which we don't actually need)
# plus the state information state_h and state_c.
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# Discard encoder_outputs; we only keep the encoder states.
encoder_states = [state_h, state_c]

# Define the decoder input.
# Again, None means sequences of any length.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# Set up the decoder to return full output sequences as well as its internal
# states. The states are not used during training, but they will be useful
# at inference time.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# Use the encoder's output states as the decoder's initial state.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Add a fully connected softmax layer.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the full model.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
model.save('s2s.h5')

# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
#    and a "start of sequence" token as target.
#    Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.save('encoder_model.h5')

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
decoder_model.save('decoder_model.h5')
These two .h5 files are the saved models.
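Strictly speaking, s2s.h5 alone would also be enough: the official Keras restore example (lstm_seq2seq_restore.py) rebuilds both sampling models from it by re-wiring the loaded layers. A sketch of that approach (the layer indices are an assumption tied to the exact architecture above):
from keras.models import Model, load_model
from keras.layers import Input

model = load_model('s2s.h5')
encoder_inputs = model.input[0]
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # the encoder LSTM
encoder_model = Model(encoder_inputs, [state_h_enc, state_c_enc])

decoder_inputs = model.input[1]
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = model.layers[3](  # the decoder LSTM
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_outputs = model.layers[4](decoder_outputs)    # the softmax Dense layer
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + [state_h, state_c])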
The prediction code is shown below; it just loads the two models saved above and runs inference.
# -*- coding: utf-8 -*-
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
from keras.models import load_model
# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states
# Define sampling models
batch_size = 64 # Batch size for training.
epochs = 100 # Number of epochs to train for.
latent_dim = 256 # Latent dimensionality of the encoding space.
num_samples = 10000 # Number of samples to train on.
# Path to the data txt file on disk.
data_path = 'fra.txt'
# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')  # one sentence pair per line
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text = line.split('\t')  # a tab separates English from French
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'  # French output starts with \t and ends with \n
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])
# print('Number of samples:', len(input_texts))#10000
# print('Number of unique input tokens:', num_encoder_tokens)#69
# print('Number of unique output tokens:', num_decoder_tokens)#93
# print('Max sequence length for inputs:', max_encoder_seq_length)#16
# print('Max sequence length for outputs:', max_decoder_seq_length)#59
# Build char -> index dictionaries, used to vectorize the characters.
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])   # input chars get indices 0..68
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])  # target chars get indices 0..92
# Allocate the arrays.
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),  # (10000, 16, 69)
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),  # (10000, 59, 93)
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),  # (10000, 59, 93)
    dtype='float32')
# Fill the arrays: one-hot encode every character.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # One-hot encode the encoder input sequence.
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # One-hot encode the decoder input and target sequences.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep.
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data omits the start character and is one
            # timestep ahead of decoder_input_data.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
# print(decoder_target_data.shape)  # (10000, 59, 93)
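# Illustrative example of the one-timestep shift (teacher forcing): for the
# target sentence '\tVa !\n',
#   decoder_input_data  encodes ['\t', 'V', 'a', ' ', '!', '\n'] at steps 0-5,
#   decoder_target_data encodes ['V', 'a', ' ', '!', '\n'] at steps 0-4.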
encoder_model = load_model('encoder_model.h5')
decoder_model = load_model('decoder_model.h5')
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char
        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
                len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        # Update states
        states_value = [h, c]
    return decoded_sentence
for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
I only show two of the results here; they turned out pretty well!
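To try a sentence that is not in the training set, one-hot encode it with the same input_token_index first. A minimal sketch (the helper encode_input is my own name; any character unseen in training would raise a KeyError):
def encode_input(sentence):
    # One-hot encode a new English sentence the same way as the training data.
    seq = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
    for t, char in enumerate(sentence[:max_encoder_seq_length]):
        seq[0, t, input_token_index[char]] = 1.
    return seq

print(decode_sequence(encode_input('Run!')))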