Deep Learning: LSTM via Keras

Based on: Example of NLP via Keras LSTM
Data:

git clone https://github.com/wojzaremba/lstm
mv lstm/data .
\rm -rf lstm

Below is the runnable code after adding the necessary imports (note that by default it trains for 40 epochs, which takes a fairly long time; TensorFlow/Keras will use GPU compute instructions whenever possible, and the power draw is not small either):
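To check up front whether TensorFlow actually sees a GPU (or to force CPU-only execution), a minimal sketch; the CUDA_VISIBLE_DEVICES trick is a standard environment-variable setting, not part of the tutorial itself:

import os
# Hiding all GPUs must happen before TensorFlow initializes its devices:
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))  # an empty list means CPU-only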


import pathlib, os
import collections 

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, TimeDistributed, Activation, Dense, Dropout
import numpy as np

"""
REFERENCES

https://adventuresinmachinelearning.com/keras-lstm-tutorial/
https://github.com/wojzaremba/lstm

"""

_CUR_DIR = str(pathlib.Path(__file__).parent.absolute())

class KerasBatchGenerator(object):

    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # this will track the progress of the batches sequentially through the
        # data set - once the data reaches the end of the data set it will reset
        # back to zero
        self.current_idx = 0
        # skip_step is the number of words which will be skipped before the next
        # batch is skimmed from the data set
        self.skip_step = skip_step
        
    def generate(self):
        x = np.zeros((self.batch_size, self.num_steps))
        y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
        while True:
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
                temp_y = self.data[self.current_idx + 1:self.current_idx + self.num_steps + 1]
                # convert all of temp_y into a one hot representation
                y[i, :, :] = keras.utils.to_categorical(temp_y, num_classes=self.vocabulary)
                self.current_idx += self.skip_step
            yield x, y
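
# What generate() yields, concretely: with batch_size=20 and num_steps=30,
# x has shape (20, 30) and holds integer word ids, while y has shape
# (20, 30, vocabulary) and holds the one-hot encoding of the same window
# shifted one position to the right -- so the target at every time step is
# simply the next word in the text.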

def _build_vocab(filename):
    data = _read_words(filename)

    # sort words by descending frequency, then alphabetically, and assign ids
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    id_to_word = dict((v, k) for k, v in word_to_id.items())

    return word_to_id, id_to_word

def _read_words(filename):
    with tf.io.gfile.GFile(filename, "r") as f:
        return f.read().replace("\n", "<eos>").split()

def _file_to_word_ids(filename, word_to_id):
    # keep only words that appear in the training vocabulary
    data = _read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]
         
def load_data():
    # get the data paths
    train_path = os.path.join(_CUR_DIR, 'data', "ptb.train.txt")
    valid_path = os.path.join(_CUR_DIR, 'data', "ptb.valid.txt")
    test_path = os.path.join(_CUR_DIR, 'data', "ptb.test.txt")

    # build the complete vocabulary, then convert the text data to lists of word ids
    word_to_id, reversed_dictionary = _build_vocab(train_path)
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)

    # sanity checks
    print(train_data[:5])
    print(list(word_to_id.items())[:5])  # a few (word, id) pairs instead of the full dict
    print(vocabulary)
    print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
    return train_data, valid_data, test_data, vocabulary, reversed_dictionary
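
# Note: with the standard PTB preprocessing used here, the vocabulary works
# out to 10,000 words (including the <eos> and <unk> tokens).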

""" Model Params """
num_steps=30 
batch_size=20 
hidden_size=500
use_dropout = True
num_epochs = 40

""" Start training """     
train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

train_data_generator = KerasBatchGenerator(train_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(valid_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)

model = keras.Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))

if use_dropout:
    model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

# ModelCheckpoint does not create the target directory itself
os.makedirs(os.path.join(_CUR_DIR, 'output'), exist_ok=True)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(_CUR_DIR, 'output', 'model-{epoch:02d}.hdf5'), verbose=1)

# fit_generator is deprecated in TF 2.x; model.fit accepts generators directly
model.fit(train_data_generator.generate(),
          steps_per_epoch=len(train_data)//(batch_size*num_steps),
          epochs=num_epochs,
          validation_data=valid_data_generator.generate(),
          validation_steps=len(valid_data)//(batch_size*num_steps),
          callbacks=[checkpointer])
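
After training, any of the saved checkpoints can be reloaded to inspect what the network has learned. The snippet below is a minimal sketch, not part of the original tutorial: the checkpoint file name model-40.hdf5 assumes the final epoch's file exists in output/, and greedy argmax decoding is used only for simplicity:

model = keras.models.load_model(os.path.join(_CUR_DIR, 'output', 'model-40.hdf5'))

example_generator = KerasBatchGenerator(test_data, num_steps, 1, vocabulary,
                                        skip_step=1)
x, _ = next(example_generator.generate())
prediction = model.predict(x)                      # shape (1, num_steps, vocabulary)
predicted_ids = np.argmax(prediction, axis=-1)[0]  # greedy argmax at each time step
print("input :", " ".join(reversed_dictionary[int(w)] for w in x[0]))
print("output:", " ".join(reversed_dictionary[int(w)] for w in predicted_ids))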


