import os
import numpy as np
import tensorflow as tf
def load_data(file_path):
    """
    Load the raw text data.
    :param file_path: path to the input text file
    :return: list of lines
    """
    with open(file_path, 'r', encoding='utf-8') as reader:
        data = reader.readlines()
    return data
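
# Note: readlines() keeps the trailing '\n' on each line, which is why the
# callers below strip() every line before splitting it into words.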
def create_lookup_table(text):
    """
    Build the lookup tables {word: index} and {index: word}.
    :param text: list of words
    :return: (word2int, int2word)
    """
    words = sorted(list(set(text)))
    # Build the dictionaries
    word2int = {word: idx for idx, word in enumerate(words)}
    int2word = dict(enumerate(words))
    return word2int, int2word
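
# A tiny worked example (illustrative words, not from the training corpus):
# for text = ['the', 'cat', 'saw', 'the'], the sorted unique vocabulary is
# ['cat', 'saw', 'the'], so create_lookup_table returns
#   word2int == {'cat': 0, 'saw': 1, 'the': 2}
#   int2word == {0: 'cat', 1: 'saw', 2: 'the'}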
def create_X_and_Y(data, word2int, number_time_steps=3):
    """
    Build the training inputs X and targets Y from the raw data,
    using a sliding window of `number_time_steps` words per sample.
    :param data: list of raw text lines
    :param word2int: word -> index lookup table
    :param number_time_steps: window size (number of input words per sample)
    :return: (X, Y) as numpy ndarrays
    """
    X, Y = [], []
    for content in data:
        # Split the current line into its word sequence; strip() removes surrounding whitespace.
        words = content.strip().split(' ')
        # Total number of words in this line
        words_number = len(words)
        offset = 0
        while offset < words_number - number_time_steps:
            temp_x = words[offset: offset + number_time_steps]
            temp_y = words[offset + number_time_steps]
            X.append([word2int[tx] for tx in temp_x])
            Y.append(word2int[temp_y])
            offset += 1
    # Convert the lists to numpy ndarrays
    X = np.asarray(X).reshape([-1, number_time_steps])
    Y = np.asarray(Y).reshape(-1)
    return X, Y
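
# Sliding-window example (illustrative): with number_time_steps=3, the line
# 'a b c d e' yields two samples:
#   X = [['a', 'b', 'c'], ['b', 'c', 'd']]  -> mapped to ids via word2int
#   Y = ['d', 'e']                          -> mapped to ids via word2int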
def create_model(vocab_size, num_units=32, number_time_steps=3):
    """
    Build the bidirectional LSTM language model.
    :param vocab_size: vocabulary size
    :param num_units: number of hidden units (neurons) per LSTM cell
    :param number_time_steps: number of time steps
    :return: (input placeholder, label placeholder, logits, predictions)
    """
    with tf.variable_scope('Network', initializer=tf.truncated_normal_initializer(stddev=0.1)):
        with tf.variable_scope('input'):
            # Shape of the input data, e.g.:
            # x:
            #   [[2, 3, 4],
            #    [7, 8, 9]]
            # y:
            #   [5, 10]
            _x = tf.placeholder(tf.int32, shape=[None, number_time_steps], name='x')
            _y = tf.placeholder(tf.int32, shape=[None], name='y')
            x_float = tf.cast(_x, tf.float32)
            # Split the input along the time axis into a list of per-step tensors.
            # TODO: the raw integer ids are fed in directly here; a real project
            # should use one-hot encoding or an embedding lookup instead.
            input_x = tf.split(x_float, num_or_size_splits=number_time_steps, axis=1)
            # input_x: [[N, 1], [N, 1], ...]
        with tf.variable_scope('rnn'):
            # a. Define the forward and backward cells
            cell_fw = tf.nn.rnn_cell.BasicLSTMCell(num_units=num_units)
            cell_bw = tf.nn.rnn_cell.BasicLSTMCell(num_units=num_units)
            # b. Run the static bidirectional RNN to get the hidden-layer outputs
            rnn_outputs, _, _ = tf.nn.static_bidirectional_rnn(
                cell_fw=cell_fw, cell_bw=cell_bw, inputs=input_x, dtype=tf.float32
            )
            # rnn_outputs: [[N, 2*num_units], [N, 2*num_units], ...]
        with tf.variable_scope('logits'):
            # a. Take the output of the last time step
            rnn_output = rnn_outputs[-1]
            # b. Build the output-layer variables
            softmax_w = tf.get_variable(
                'w', shape=[2 * num_units, vocab_size], dtype=tf.float32
            )
            softmax_b = tf.get_variable(
                'b', shape=[vocab_size], dtype=tf.float32, initializer=tf.zeros_initializer()
            )
            logits = tf.nn.xw_plus_b(rnn_output, softmax_w, softmax_b)
        with tf.variable_scope('Predict'):
            predictions = tf.argmax(logits, axis=1)
    return _x, _y, logits, predictions
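
# A minimal sketch of the one-hot alternative mentioned in the TODO above
# (assumption: same [None, number_time_steps] int32 input). It is not wired
# into create_model; it only illustrates the intended replacement for feeding
# raw ids into the RNN.
def split_one_hot_inputs(_x, vocab_size, number_time_steps):
    # [N, T] int ids -> [N, T, vocab_size] one-hot
    one_hot = tf.one_hot(_x, depth=vocab_size, dtype=tf.float32)
    # -> list of T tensors, each of shape [N, vocab_size]
    return [tf.squeeze(t, axis=1) for t in tf.split(one_hot, number_time_steps, axis=1)]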
def create_loss(logits, labels):
    """
    Create the loss.
    :param logits: unnormalized output scores, shape [N, vocab_size]
    :param labels: integer class labels
    :return: scalar loss tensor
    """
    with tf.name_scope('loss'):
        # a. Flatten the labels to a 1-D tensor
        labels = tf.reshape(labels, shape=[-1])
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels
        ))
    return loss
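
# For reference: sparse_softmax_cross_entropy_with_logits takes integer class
# labels directly, so no one-hot encoding of y is needed. E.g. with
# vocab_size=4 and label 2, it is equivalent to dense softmax cross-entropy
# against the one-hot target [0, 0, 1, 0].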
def create_optimizer(loss, lr=1e-3):
    """
    Build the optimizer.
    :param loss: scalar loss tensor to minimize
    :param lr: learning rate
    :return: training op
    """
    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        train_opt = optimizer.minimize(loss)
    return train_opt
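
# Note: minimize() is shorthand for compute_gradients() followed by
# apply_gradients(); splitting those two calls apart is the usual hook point
# for gradient clipping, which RNNs often benefit from.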
def train(checkpoint_dir, max_steps=10000, batch_size=64, num_units=32, number_time_steps=10):
    graph = tf.Graph()
    with graph.as_default():
        # Load the data
        data = load_data(file_path='../datas/belling_the_cat.txt')
        text = []
        for line in data:
            line = line.strip()
            for word in line.split(' '):
                text.append(word)
        word2int, int2word = create_lookup_table(text)
        x, y = create_X_and_Y(data, word2int, number_time_steps=number_time_steps)
        # print(word2int, '\n', int2word)
        # 1. Build the network
        _x, _y, logits, predictions = create_model(
            len(word2int), num_units=num_units, number_time_steps=number_time_steps)
        # 2. Model loss
        loss = create_loss(logits, _y)
        # 3. Optimizer
        train_opt = create_optimizer(loss)
        saver = tf.train.Saver()
    with tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        # Set up batch iteration
        total_samples = x.shape[0]
        n_batches = total_samples // batch_size
        batch_index = 0
        # Random permutation of the sample indices, i.e. a shuffle
        random_index = np.random.permutation(total_samples)
        for step in range(1, max_steps + 1):
            # Fetch the current training batch
            start_idx = batch_index * batch_size
            end_idx = start_idx + batch_size
            idx = random_index[start_idx: end_idx]
            train_x = x[idx]
            train_y = y[idx]
            # Build the feed dict
            feed = {_x: train_x, _y: train_y}
            sess.run(train_opt, feed)
            if step % 200 == 0:
                train_loss = sess.run(loss, feed)
                print('step:{} - Train loss:{}'.format(step, train_loss))
                # Run a sample prediction
                index = np.random.randint(low=0, high=total_samples)
                sample_in = np.reshape(x[index], newshape=[-1, number_time_steps])
                sample_out = sess.run(predictions, feed_dict={_x: sample_in})
                print('input:{} - predicted:{} vs actual:{}'.format(
                    x[index], int2word[sample_out[0]], int2word[y[index]]))
            if step % 1000 == 0:
                # Persist the model
                files = 'model.ckpt'
                save_files = os.path.join(checkpoint_dir, files)
                saver.save(sess, save_path=save_files, global_step=step)
                print('model saved!!')
            # Advance to the next batch; reshuffle after a full epoch
            batch_index += 1
            if batch_index == n_batches:
                batch_index = 0
                random_index = np.random.permutation(total_samples)
if __name__ == '__main__':
checkpoint_dir = './models'
if not os.path.exists(checkpoint_dir):
os.makedirs(checkpoint_dir)
train(
checkpoint_dir, max_steps=10000, batch_size=64, num_units=32, number_time_steps=10
)
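
# A minimal restore sketch (assumption: run in a fresh process after training;
# the graph must be rebuilt with the same shapes before restoring the weights):
#
#   graph = tf.Graph()
#   with graph.as_default():
#       _x, _y, logits, predictions = create_model(
#           vocab_size, num_units=32, number_time_steps=10)
#       saver = tf.train.Saver()
#   with tf.Session(graph=graph) as sess:
#       saver.restore(sess, tf.train.latest_checkpoint('./models'))
#       # sess.run(predictions, feed_dict={_x: ...})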