Reference: Example of NLP via Keras LSTM
Data (the Penn Treebank text files that ship with wojzaremba/lstm):
git clone https://github.com/wojzaremba/lstm
mv lstm/data .
\rm -rf lstm
Executable code after adding the required imports (note: by default the code below trains for 40 epochs, which takes quite a while; TensorFlow/Keras will use GPU compute instructions where it can, and the power draw is not small):
import pathlib, os
import collections
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Activation, Dense, Dropout
import numpy as np
"""
REFERENCES
https://adventuresinmachinelearning.com/keras-lstm-tutorial/
https://github.com/wojzaremba/lstm
"""
_CUR_DIR = str(pathlib.Path(__file__).parent.absolute())
class KerasBatchGenerator(object):
def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
self.data = data
self.num_steps = num_steps
self.batch_size = batch_size
self.vocabulary = vocabulary
# this will track the progress of the batches sequentially through the
# data set - once the data reaches the end of the data set it will reset
# back to zero
self.current_idx = 0
# skip_step is the number of words which will be skipped before the next
# batch is skimmed from the data set
self.skip_step = skip_step
def generate(self):
x = np.zeros((self.batch_size, self.num_steps))
y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
while True:
for i in range(self.batch_size):
if self.current_idx + self.num_steps >= len(self.data):
# reset the index back to the start of the data set
self.current_idx = 0
x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
temp_y = self.data[self.current_idx + 1:self.current_idx + self.num_steps + 1]
# convert all of temp_y into a one hot representation
y[i, :, :] = keras.utils.to_categorical(temp_y, num_classes=self.vocabulary)
self.current_idx += self.skip_step
yield x, y
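# Each yielded batch: x has shape (batch_size, num_steps) and holds word ids;
# y has shape (batch_size, num_steps, vocabulary) and one-hot encodes, at each
# position t, the word that follows x[i, t] in the text.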
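# _build_vocab ranks words by descending frequency (ties broken alphabetically)
# and assigns ids in that order, so id 0 is the most frequent word in the
# training text.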
def _build_vocab(filename):
data = _read_words(filename)
counter = collections.Counter(data)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
word_to_id = dict(zip(words, range(len(words))))
id_to_word = dict((v, k) for k, v in word_to_id.items())
return word_to_id, id_to_word
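# _read_words replaces every line break with the '<eos>' token, so sentence
# boundaries become ordinary vocabulary items the model can learn to predict.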
def _read_words(filename):
with tf.io.gfile.GFile(filename, "r") as f:
return f.read().replace("\n", "<eos>").split()
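# Words that do not appear in the training vocabulary are silently dropped.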
def _file_to_word_ids(filename, word_to_id):
data = _read_words(filename)
return [word_to_id[word] for word in data if word in word_to_id]
def load_data():
# get the data paths
train_path = os.path.join(_CUR_DIR, 'data', "ptb.train.txt")
valid_path = os.path.join(_CUR_DIR, 'data', "ptb.valid.txt")
test_path = os.path.join(_CUR_DIR, 'data', "ptb.test.txt")
# build the complete vocabulary, then convert text data to list of integers
    word_to_id, reversed_dictionary = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    # quick sanity checks
    print(train_data[:5])
    print(word_to_id)  # prints the full vocabulary dict; comment out if too noisy
    print(vocabulary)
    print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
return train_data, valid_data, test_data, vocabulary, reversed_dictionary
""" Model Params """
num_steps = 30     # window length: each sample is 30 consecutive words
batch_size = 20
hidden_size = 500  # embedding dimension and LSTM units
use_dropout = True
num_epochs = 40
""" Start training """
train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()
# skip_step=num_steps gives non-overlapping windows over the text
train_data_generator = KerasBatchGenerator(train_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(valid_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
model = keras.Sequential()
# map each word id to a dense hidden_size-dimensional vector
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
# two stacked LSTM layers, returning the full output sequence at every step
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
# per-timestep projection back onto the vocabulary, followed by softmax
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
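# Optional sanity check: print the layer stack and parameter counts
model.summary()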
# make sure the checkpoint directory exists before training starts
os.makedirs(os.path.join(_CUR_DIR, 'output'), exist_ok=True)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(_CUR_DIR, 'output', 'model-{epoch:02d}.hdf5'), verbose=1)
# fit_generator is deprecated (and removed in recent TF releases); model.fit
# accepts generators directly. steps_per_epoch is the number of
# batch_size*num_steps-sized windows that fit in each data set.
model.fit(train_data_generator.generate(),
          steps_per_epoch=len(train_data)//(batch_size*num_steps),
          epochs=num_epochs,
          validation_data=valid_data_generator.generate(),
          validation_steps=len(valid_data)//(batch_size*num_steps),
          callbacks=[checkpointer])
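After training, a quick way to eyeball the model is to compare its next-word predictions against the actual test text. A minimal sketch, assuming the run completed all 40 epochs so that output/model-40.hdf5 exists (pick whichever checkpoint you actually have):

# load a saved checkpoint (filename is an assumption; it follows the
# 'model-{epoch:02d}.hdf5' pattern used by the ModelCheckpoint above)
model = keras.models.load_model(os.path.join(_CUR_DIR, 'output', 'model-40.hdf5'))
example_generator = KerasBatchGenerator(test_data, num_steps, 1, vocabulary, skip_step=1)
# take one (x, y) pair from the test set and predict each next word
x, y = next(example_generator.generate())
predictions = model.predict(x)  # shape (1, num_steps, vocabulary)
predicted_ids = np.argmax(predictions, axis=-1)[0]
true_ids = np.argmax(y, axis=-1)[0]
print("Actual:    " + " ".join(reversed_dictionary[int(i)] for i in true_ids))
print("Predicted: " + " ".join(reversed_dictionary[int(i)] for i in predicted_ids))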